#!/usr/bin/python
#
#	simulate data from CRM
#
#	Data structures:
#
#		CRM = ( logical_aspects , spatial_aspects );
#
#		logical_aspects = list of pairs:  (list-of-motif-disjuncts, parallel list-of-preference-probability)
#
#		spatial_aspects = (strand, distance, order)
#
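#		strand = function(binding_site_index) -> probability that the motif is planted as written (otherwise it is reverse-complemented)
#		distance = function(binding_site_index_1, binding_site_index_2, sequence-length)  ->  function(distance) -> relative weight of that distance (the caller normalizes the weights)
#		order = function(upstream_binding_site_index, downstream_binding_site_index) -> probability that the first site lies upstream of the second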
#
#	See `main' for program options (using optparse).
#


import sys, string, random, math, optparse, copy;

# CRM logical aspects: one entry per binding site, each entry being
#	(list of alternative motifs, parallel list of preference weights).
# positive() replaces any motif character outside "ACGT" (such as 'n')
# with a random base.
CRM_logical = [
	(["CGGATATCCG"], [1]),
	(["ACTTnnGGGACTT"], [1]),
	### (["GGGATTAGGG"], [1]),
]

def CRM_strand(index):
	"""Return the strand probability configured for binding site `index`.

	positive() plants the motif as written when a random draw falls below
	this value and plants its reverse complement otherwise.
	"""
	strand_prob = [
		0.50,	# site 0
		0.9,	# site 1
		## x.xx	# template for a further site
	]
	return strand_prob[index]

def CRM_distance(index1, index2, L):
	"""Return the distance-weight function for a pair of binding sites.

	index1, index2 -- binding-site indices; the pair is treated as unordered.
	L              -- sequence length, used to scale the distributions.

	Returns a function mapping a distance to a gaussian() weight.
	A pair of identical indices describes the distance of that site from
	the end of the sequence (positive() passes the same index twice for
	the first site it places).

	Raises ValueError when no distribution is defined for the pair.
	"""

	# The table below is only filled in for index1 <= index2.
	if (index1 > index2):
		index1, index2 = index2, index1

	#from end
	if (index1==0 and index2==0):  return lambda x : gaussian(L/2.5, L/2.5, x)
	if (index1==1 and index2==1):  return lambda x : gaussian(L/2.5, L/2.5, x)
	##if (index1==2 and index2==2):  return lambda x : gaussian(L/6.0, L/3.0, x);

	#pairwise
	if (index1==0 and index2==1):  return lambda x : gaussian(L/5.0, L/2.5, x)
	##if (index1==0 and index2==2):  return lambda x : gaussian(L/6.0, L/3.0, x);
	##if (index1==1 and index2==2):  return lambda x : gaussian(L/6.0, L/3.0, x);

	# A bare string cannot be raised; use a real exception so the message
	# actually reaches the user.
	raise ValueError("Error calling CRM_distance(index1=" + str(index1) + ", index2=" + str(index2) + ", L=" + str(L) + ").")

def CRM_order(index1, index2):
	"""Return the probability that site `index1` lies upstream of site `index2`.

	Pairs without an entry in the table (including identical indices)
	yield 0.
	"""
	upstream_prob = {
		(0, 1): 0.9,
		(1, 0): 0.1,
		## templates for a third site:
		## (0, 2): 0.7,  (1, 2): 0.3,  (2, 0): 0.3,  (2, 1): 0.7,
	}
	return upstream_prob.get((index1, index2), 0)

# Spatial aspects, bundled in the (strand, distance, order) order in which
# positive() and negative() unpack them.
CRM_spatial = (CRM_strand, CRM_distance, CRM_order);
# The complete model: (logical aspects, spatial aspects).
CRM = (CRM_logical, CRM_spatial);

def CRM_prior(location):
	"""Return the (unnormalized) prior weight of a binding site at `location`.

	The weight is a sine wave in `location` with a period of 200 base
	pairs, scaled to range from the baseline 0.30 up to 1.0.
	"""
	period = 200	# base pairs
	baseline = 0.30
	phase = ((2*math.pi)/period) * (location)
	wave = 1.0/2 + 1.0/2 * math.sin(phase)	# in [0, 1]
	return baseline + (1-baseline) * wave



def normalize(d):
	"""Return the values of `d`, as floats, scaled so that they sum to 1.0.

	d -- any iterable of numbers.  It is copied into a list first, so
	     one-shot iterables (generators, Python 3 map objects) work too.

	Returns a new list; an empty input gives an empty list.
	Raises ZeroDivisionError when the input is non-empty but sums to zero.
	"""
	values = [float(p) for p in d]
	total = sum(values)
	return [v / total for v in values]

# generate a positive sequence
def positive(crm, L, mm, PRIOR_DISTRIBUTION):
	"""Generate one background sequence with every binding site of `crm` planted in it.

	crm                -- (structure, (strandf, distancef, orderf)); see the file header.
	L                  -- length passed to generate() and forwarded to distancef.
	mm                 -- Markov table passed to generate().
	PRIOR_DISTRIBUTION -- per-position weights, indexed here by absolute position
	                      (indices 1 .. len(sequence) are read).

	Returns the sequence string.  Background and mutated motif bases are
	lower case; unmutated motif bases keep the case used in `structure`.

	NOTE(review): a motif planted closer to the end than its own length
	extends the string, so the result can be longer than L -- confirm
	whether that is intended.
	"""

	# Probability that each base of a planted motif is replaced by a random base.
	A = PROB_OF_BS_BASE_MUTATION = 0.15;

	sequence = generate(L, mm);

	(structure, (strandf, distancef, orderf)) = crm;
	
	# choose an order
	# Each candidate ordering is weighted by the product of orderf over all
	# (earlier, later) pairs in it; one ordering is then drawn from the
	# normalized weights.
	orderings = gen_ordering(len(structure));
	order_prob = [ ];
	for order in orderings:
		op = 1.0;
		for up in range(0, len(order)-1):
			for down in range(up+1, len(order)):
				op = op * orderf(order[up], order[down]);
		order_prob = order_prob + [ op ];
	order = orderings[ select(normalize(order_prob)) ];


	# place motifs
	# Sites are planted from the last one in `order` to the first, moving
	# the anchor from the end of the sequence toward its start.
	rev_order = copy.deepcopy(order);
	rev_order.reverse();

	anchor = len(sequence);	# anchor = current motif position
	# For the first site placed, distancef receives the same index twice,
	# i.e. the "distance from end" distribution.
	last_site_index = rev_order[0];

	for site_index in rev_order:

		(motifs, motif_preference) = structure[site_index];

		# Pick one of the alternative motifs according to its preference weight.
		motif = motifs[select(normalize(motif_preference))];

		# alter some bases
		# Characters outside "ACGT" (e.g. 'n') are always randomized; the
		# others are randomized with probability A.
		for i in range(0,len(motif)):
				if ((not motif[i] in "ACGT") or (chance(A))):
					# choose a random base
					motif = motif[:i] + "acgt"[rand(4)] + motif[i+1:];

		# strand
		if (not chance(strandf(site_index))):
			# rev comp
			motif = revcomp(motif);


		# Weight every distance from the current anchor by the pairwise
		# distance function times the positional prior, then draw one.
		possible_distances = range(0, anchor);
		dd = [ ];
		for possible_distance in possible_distances:
			p = 1;
			p = p * distancef(site_index, last_site_index, L)(possible_distance);
			p = p * PRIOR_DISTRIBUTION[anchor - possible_distance];
			dd = dd + [ p ];
		dd_index = select(normalize(dd));
		D = possible_distances[dd_index];

		anchor = anchor - D;

		# plant motif
		# The motif overwrites the bases starting at `anchor`.
		# NOTE(review): when D is smaller than the motif length this also
		# overwrites part of the previously planted motif -- confirm intent.
		sequence = sequence[:anchor] + motif + sequence[anchor + len(motif):];

		last_site_index = site_index;
	
	return sequence;

def negative(crm, L, mm, PRIOR_DISTRIBUTION):
	"""Generate one negative (decoy) sequence.

	With probability 0.7 the result is plain background from
	generate(L, mm).  Otherwise one binding site of `crm` is picked at
	random and planted alone by calling positive() with a single-site
	version of the model.

	Parameters have the same meaning as for positive().
	"""
	if chance(0.7):
		# just a background sequence
		return generate(L, mm)

	# create a single-motif version of the CRM
	structure, (strandf, distancef, orderf) = crm
	chosen = rand(len(structure))

	# The reduced model has a single site (index 0 inside positive()), so
	# the spatial functions ignore the indices they are given and always
	# answer for the chosen site of the full model.
	def lone_strand(_index):
		return strandf(chosen)

	def lone_distance(_index1, _index2, length):
		return distancef(chosen, chosen, length)

	def lone_order(_index1, _index2):
		return orderf(chosen, chosen)

	lone_crm = ([structure[chosen]], (lone_strand, lone_distance, lone_order))
	return positive(lone_crm, L, mm, PRIOR_DISTRIBUTION)

def _write_sequence_set(path, sign, count, make_sequence, prior_line, priors_out, weights_out, verbose):
	"""Write `count` simulated sequences to `path` plus their prior/weight records.

	path          -- FASTA file to create.
	sign          -- "+" or "-"; used in the record headers and as the progress mark.
	count         -- number of sequences to generate.
	make_sequence -- zero-argument callable returning one sequence string.
	prior_line    -- preformatted prior line (newline included) written for every sequence.
	priors_out, weights_out -- already open files receiving one record per sequence.
	verbose       -- when true, draw a progress bar on stderr.
	"""
	with open(path, "w") as out:
		if verbose:
			sys.stderr.write("[\r" + (" " * count) + "]\r[")
		for s in range(0, count):
			header = "> " + sign + str(s)
			out.write(header + "\n" + make_sequence() + "\n")
			priors_out.write(header + "\n" + prior_line)
			weights_out.write(header + "\n" + "1" + "\n")
			if verbose:
				sys.stderr.write(sign)
	if verbose:
		# erase the progress bar
		sys.stderr.write("\r" + (' ' * (count + 2)) + "\r")


def main():
	"""Parse the command line and write the simulated data set.

	Files created in the current directory:
		candidates.fa -- every motif of the CRM
		positive.fa   -- options.P sequences from positive()
		negative.fa   -- options.N sequences from negative()
		V.pfa         -- per-sequence position priors
		W.pfa         -- per-sequence weights (always "1")

	An optional positional argument names a Markov background file read
	with read_markov(); without it a uniform background is used.
	"""
	structure = CRM[0]

	parser = optparse.OptionParser(usage="usage:  %prog [options] [Markov background file]")

	parser.add_option("-L", "--len", type="int", dest="L", default=500, action="store", help="Sequence length", metavar="length")
	parser.add_option("-P", "--P", type="int", dest="P", default=100, action="store", help="Number of positive sequences", metavar="pos")
	parser.add_option("-N", "--N", type="int", dest="N", default=100, action="store", help="Number of negative sequences", metavar="neg")
	parser.add_option("-q", "--quiet", dest="verbose", default=True, action="store_false", help="Quiet mode")
	(options, args) = parser.parse_args()

	# Build a real list: on Python 3 map() returns a one-shot iterator,
	# which can be neither traversed twice nor indexed.
	PRIOR_DISTRIBUTION = normalize([CRM_prior(x) for x in range(0, options.L + 1)])

	if args:
		# Close the background file as soon as it has been read.
		with open(args[0]) as markov_file:
			mm = read_markov(markov_file)
	else:
		mm = {'a':0.25,'c':0.25,'g':0.25,'t':0.25}

	# The prior line is identical for every sequence, so format it once.
	prior_line = "".join([str(p) + " " for p in PRIOR_DISTRIBUTION[:options.L]]) + "\n"

	def make_positive():
		return positive(CRM, options.L, mm, PRIOR_DISTRIBUTION)

	def make_negative():
		return negative(CRM, options.L, mm, PRIOR_DISTRIBUTION)

	# The with-blocks close every output file even if generation fails.
	with open("W.pfa", "w") as weights_out:
		with open("V.pfa", "w") as priors_out:

			with open("candidates.fa", "w") as out:
				for (motifs, d) in structure:
					for motif in motifs:
						out.write(">\n" + motif + "\n")

			_write_sequence_set("positive.fa", "+", options.P, make_positive, prior_line, priors_out, weights_out, options.verbose)
			_write_sequence_set("negative.fa", "-", options.N, make_negative, prior_line, priors_out, weights_out, options.verbose)


def gaussian(mu, sigma, x):
	"""Return the normal probability density at `x`.

	mu    -- mean of the distribution.
	sigma -- standard deviation; must be non-zero.
	x     -- point at which the density is evaluated.

	The exponent is -(x - mu)^2 / (2 * sigma^2).  The factor 2 was
	previously missing, which made the curve narrower than `sigma`
	states and inconsistent with the 1/(sigma*sqrt(2*pi)) normalization.

	Raises ZeroDivisionError when sigma is 0.
	"""
	# float() keeps the division exact under Python 2 integer arguments.
	z = (float(x) - mu) / sigma
	return math.exp(-0.5 * z * z) / (sigma * math.sqrt(2 * math.pi))

def rnd():
	"""Return a random float drawn by random.random(), i.e. in [0.0, 1.0)."""
	sample = random.random()
	return sample

def rand(limit):
	"""Return int(rnd() * limit): for a positive `limit`, a random integer in 0 .. limit-1."""
	scaled = rnd() * limit
	return int(scaled)

def chance(p):
	"""Return True when a fresh rnd() draw is below `p` (i.e. with probability `p` for p in [0, 1])."""
	draw = rnd()
	return draw < p



def select(d):
	"""Randomly pick an index of `d`, with probability proportional to its weight.

	d -- iterable of non-negative weights; they need not sum to 1.

	Returns the chosen index.  Entries with weight 0 are never chosen.
	When `d` is empty or its weights do not sum to a positive value, a
	warning is written to stderr and -1 is returned.
	"""
	weights = [float(p) for p in d]
	total = sum(weights)

	if total > 0:
		# One uniform draw, mapped onto the running sum of the weights.
		threshold = random.random() * total
		running = 0.0
		last_positive = -1
		for i, w in enumerate(weights):
			if w > 0:
				running = running + w
				last_positive = i
				if threshold < running:
					return i
		# Float round-off can leave the threshold just past the final
		# running sum; the last positive-weight entry is the right answer.
		return last_positive

	# stderr keeps the warning out of redirected stdout, and the call
	# form works on both Python 2 and Python 3.
	sys.stderr.write("WARNING:  select(" + str(d) + ").\n")
	return -1


def generate(L, mm={'a':0.25,'c':0.25,'g':0.25,'t':0.25}):
	"""Generate a lower-case DNA sequence of length `L` from a Markov chain.

	L  -- number of bases to generate.
	mm -- Markov table mapping a lower-case word (context + next base) to
	      its weight; the default is a uniform order-0 table.  The table
	      is only read, never modified.  For order k > 0 it must also
	      hold the shorter words used while fewer than k bases exist.

	Each base is drawn with select() from the weights of the four words
	formed by the most recent `order` bases followed by a, c, g or t.

	Raises KeyError when a needed word is missing from `mm`.
	"""
	alphabet = 'acgt'

	# Chain order inferred from the table size (4 entries -> order 0).
	order = int( math.log(len(mm)) / math.log(len(alphabet)) - 1)

	letters = [ ]
	while (len(letters) < L):
		# The context is the LAST `order` bases generated so far.  (Taking
		# the first `order` bases would freeze the context for the whole
		# sequence.)  The guard is needed because letters[-0:] is the
		# entire list.
		if order > 0:
			context = "".join(letters[-order:])
		else:
			context = ""
		weights = [ mm[(context + letter).lower()] for letter in alphabet ]
		letters.append(alphabet[select(weights)])
	return "".join(letters)
	
def read_markov(input):
	"""Read a Markov table, returning e.g. { "acggt" : 0.324232 , ... }.

	input -- iterable of text lines (such as an open file).  Each data
	         line holds a word and a number separated by whitespace; any
	         further fields are ignored.  Blank lines and lines whose
	         first non-blank character is '#' are skipped.

	Words are stored in lower case.

	Raises ValueError when a data line has fewer than two fields or its
	second field is not a number.
	"""
	ans = { }
	for line in input:
		fields = line.split()
		# Blank lines used to crash with IndexError; skip them together
		# with comment lines.
		if not fields or fields[0].startswith("#"):
			continue
		if len(fields) < 2:
			raise ValueError("Malformed Markov line (expected '<word> <value>'): " + repr(line))
		ans[fields[0].lower()] = float(fields[1])
	return ans


def gen_ordering(size):
	"""Return every ordering (permutation) of the indices 0 .. size-1.

	Each ordering is a list.  The result is built by inserting each new
	index at every position of every ordering of the smaller indices.
	Size 0 gives an empty list; a negative size gives None.
	"""
	if size == 0:
		return [ ]
	if size < 1:
		return None

	orderings = [[0]]
	for newest in range(1, size):
		grown = [ ]
		for ordering in orderings:
			for slot in range(0, len(ordering)+1):
				grown.append(ordering[:slot] + [newest] + ordering[slot:])
		orderings = grown
	return orderings

def revcomp(motif):
	"""Return the reverse complement of `motif`.

	a/c/g/t and A/C/G/T are complemented with their case preserved; every
	other character becomes a lower-case "n".
	"""
	complement = {
		'a': "t", 'c': "g", 'g': "c", 't': "a",
		'A': "T", 'C': "G", 'G': "C", 'T': "A",
	}
	flipped = [ complement.get(letter, "n") for letter in motif ]
	flipped.reverse()
	return "".join(flipped)

# Run the simulation only when executed as a script, not when imported.
if __name__ == '__main__':
	main()





