#include "CRM.h"
#include "grid.h"
#include "custom.h"

#include <iostream>
#include <fstream>
#include <iterator>
using namespace std;

/** init/define static member vars (parameters), with their default values */

probability CRM::magic_ratio1 = 1;	// sites = 1, if P(BG)/P(CRM) < M.R., I'll skip eval of this motif instance
probability CRM::magic_ratio2 = 1;	// sites > 1, if P(BG)/P(CRM) < M.R., I'll skip eval of this motif instance

probability CRM::MDA = 1;			// Max. Distance Area:  How much of the distance area should be considered (ignore the tail end after this much of the mass)

unsigned int CRM::MAXL = 0;	// Distance range (0 through MAXL-1)
unsigned int CRM::BINW = 1;	// bin width (bin contiguous distances)
double CRM::GammaBeta = 0;	// zero <=> No gamma, use uniform distribution; > 0 <=> scale of the Gamma(2, GammaBeta) used to initialize distances in add_site()
unsigned int CRM::FPP = 10;	// Default File Parameter Precision = this many significant digits.

// Defaults: no minimum, and the largest unsigned value as the maximum
unsigned int CRM::MINIMUM_IO_ITERATIONS = 0;
unsigned int CRM::MAXIMUM_IO_ITERATIONS = ((unsigned int)-1);

void CRM::dump_order(ostream &out) const {
	
	for (unsigned int i=0; i<this->size(); i++) { 
	for (unsigned int j=i+1; j<this->size(); j++) { 

		out << "Binding Site #" << (i+1) << " upstream of #" << (j+1) << ":  "
			<< ( this->order(i,j) / ( this->order(i,j) + this->order(j,i) ) )
			<< endl;

	}}
}


/**
 * Dump every distance distribution of the model to `out'.
 * For each binding site the site-to-sequence-end distribution comes first,
 * followed by the distributions to every later binding site.  Each dump is
 * preceded by a "#"-prefixed header line; site numbers are 1-origin.
 */
void CRM::dump_distance(ostream &out) const {

	const unsigned int n_sites = this->size();

	for (unsigned int from = 0; from < n_sites; from++) {

		// the (from,from) slot:  distance from the site to the right end of the sequence
		out << endl << endl << "#   Binding Site #" << (from+1) << " to ";
		out << "Downstream (right) end of sequence" << endl;
		this->distance(from).dump(out);

		// distances from this site to each later site
		for (unsigned int to = from + 1; to < n_sites; to++) {
			out << endl << endl << "#   Binding Site #" << (from+1) << " to ";
			out << "Binding Site #" << (to+1) << endl;
			this->distance(from, to).dump(out);
		}
	}
}

void CRM::print(ostream &out) const {

	for (unsigned int s=0; s<size(); s++) { 	
	
		probability ptmpl = 
				site(s).strand_preference(BindingSite::TMPL) / 
			(site(s).strand_preference(BindingSite::TMPL)  +  site(s).strand_preference(BindingSite::TCX));

		if (site(s).negated) { out << "Negated "; }
		out << "Binding Site #" << (s+1) << " (template " << ptmpl << ")" << endl;
		probability sum = 0;
		for (unsigned int m=0; m<site(s).multiplicity(); m++) { sum += site(s).motif_preference(m); }
		for (unsigned int m=0; m<site(s).multiplicity(); m++) { 
			out << "  " << "Motif #" << (m+1)
				<< " (motif preference " << site(s).motif_preference(m)/sum << ")"
				<< " Consensus " << site(s).motif(m).consensus() << " "
				<< " MLE " << site(s).motif(m).mle() << " " 
				<< endl;
			site(s).motif(m).print(out, "   ");	// could change precision of PWM entries
		}
	}

	out << endl;

	// dump order
	this->dump_order(out);

	// summarize distances
	for (unsigned int i=0; i<size(); i++) { 
	for (unsigned int j=i; j<size(); j++) { 
		out << "Distance #" << (i+1) << " <-> ";
		if (i==j) {
			out << "END as "
				<< distance(i) << endl;
		} else {
			out << "#" << (j+1) << " as "
				<< distance(i,j) << endl;
		}
	}}

}
/** Stream insertion for a CRM:  forwards to CRM::print and returns the stream. */
ostream &operator<<(ostream &out, const CRM &crm) {
	crm.print(out);
	return out;
}
 
/**
 * Write a compact one-line logical summary of the model, e.g.
 *   (A | B) & (~C)
 * Binding sites are joined with " & ", the motifs of a site with " | ", and
 * every motif of a negated site is prefixed with "~".  Motifs are shown by
 * their consensus.  No trailing newline is written.
 */
void CRM::summarize(ostream &out) const {

	const unsigned int n_sites = this->size();

	for (unsigned int s = 0; s < n_sites; s++) {

		if (s != 0) { out << " & "; }
		out << "(";

		for (unsigned int k = 0; k < this->site(s).multiplicity(); k++) {

			if (k != 0) { out << " | "; }
			if (site(s).negated) { out << "~"; }	// this will be my negation symbol

			// either mle() or consensus() here
			// out << site(s).motif(k).mle();
			out << site(s).motif(k).consensus();
		}

		out << ")";
	}
}













/**
 * Return a CRM with the same shape as this one (same binding sites / motif
 * widths, same pairwise table dimensions, same distance sizes and bin widths)
 * in which every parameter is set to `value'.
 */
CRM CRM::structure(probability value) const {

	CRM skeleton;

	// Binding sites:  copy first, then overwrite all their parameters.
	// (The copy-then-fill cost is only avoided for the pairwise distances below.)
	skeleton.binding_sites = this->binding_sites;
	for (unsigned int s = 0; s < skeleton.size(); s++) {
		skeleton.site(s).fill(value);
	}

	// Pairwise table:  rebuild it row by row without copying distance contents.
	const unsigned int n_rows = this->pairwise_preferences.size();
	skeleton.pairwise_preferences.resize(n_rows);

	for (unsigned int r = 0; r < n_rows; r++) {

		const unsigned int n_cols = this->pairwise_preferences[r].size();
		skeleton.pairwise_preferences[r].resize(n_cols);

		for (unsigned int c = 0; c < n_cols; c++) {

			// order preference (every cell)
			skeleton.pairwise_preferences[r][c].first = value;

			// distance distribution (only kept where r <= c)
			if (c < r) { continue; }

			skeleton.pairwise_preferences[r][c].second.resize(
				this->pairwise_preferences[r][c].second.size(),
				this->pairwise_preferences[r][c].second.binw()
			);
			skeleton.pairwise_preferences[r][c].second.fill(value);
		}
	}

	#if CRM_DEBUG
	assert(this->size() == skeleton.size());
	assert(this->pairwise_preferences.size() == skeleton.pairwise_preferences.size());
	#endif

	return skeleton;
}
	
				
					
void BindingSite::normalize() { 
	
	probability sum;

	sum = strand[0] + strand[1];
	if (sum != 0) {
		strand[0] /= sum;
		strand[1] /= sum;
	}	// if sum == 0, leave strand[x] = 0;

	sum = 0;
	for (unsigned int m=0; m<size(); m++) { sum += motif_preference(m); }
	if (sum != 0) { for (unsigned int m=0; m<size(); m++) { motif_preference(m) /= sum; } }
	
	for (unsigned int m=0; m<size(); m++) { motif(m).normalize(); }

}
	
void BindingSite::pseudocount(const probability &pc) {
	
	strand[0] += pc;
	strand[1] += pc;
	for (unsigned int m=0; m<size(); m++) { 
		motif_preference(m) += pc;
		motif(m).pseudocount(pc);
	}
}
	
void BindingSite::fill(const probability &value) {
	
	strand[0] = value;
	strand[1] = value;
	for (unsigned int m=0; m<size(); m++) { 
		motif_preference(m) = value;
		motif(m).fill(value);
	}
}

/**
 * Accumulate `weight' times the parameters of `addend' into this binding
 * site:  strand preferences, motif preferences and the motifs themselves.
 * Both sites must have the same number of motifs, with matching widths
 * (checked only when CRM_DEBUG is set).
 */
void BindingSite::add(const BindingSite &addend, const probability &weight) {

	#if CRM_DEBUG
	if (this->multiplicity() != addend.multiplicity()) {
		cerr << "BindingSite has " << (this->multiplicity())
			 << " motifs, addend has " << (addend.multiplicity())
			 << endl;
	}
	assert(this->multiplicity() == addend.multiplicity());
	#endif

	// strand preferences
	this->strand_preference(TMPL) += weight * addend.strand_preference(TMPL);
	this->strand_preference(TCX) += weight * addend.strand_preference(TCX);

	// motif preferences and motifs, one motif at a time
	const unsigned int n_motifs = this->multiplicity();
	for (unsigned int k = 0; k < n_motifs; k++) {

		this->motif_preference(k) += weight * addend.motif_preference(k);

		#if CRM_DEBUG
		assert(this->motif(k).width() == addend.motif(k).width());
		#endif

		this->motif(k).add(addend.motif(k), weight);
	}

}

/**
 * Overwrite every parameter of the model with `value':  all binding sites,
 * all site-to-end and site-to-site distance distributions, and the whole
 * order table (including the diagonal).
 */
void CRM::fill(const probability &value) {

	const unsigned int n_sites = this->size();

	for (unsigned int a = 0; a < n_sites; a++) {

		// order preferences of row `a'
		for (unsigned int b = 0; b < n_sites; b++) {
			order(a, b) = value;
		}

		// distances from `a' to every later site, then from `a' to the sequence end
		for (unsigned int b = a + 1; b < n_sites; b++) {
			distance(a, b).fill(value);
		}
		this->distance(a).fill(value);

		// the binding site itself
		this->site(a).fill(value);
	}
}

/**
 * Add the pseudocount `pc' to every parameter of the model:  all binding
 * sites, all site-to-end and site-to-site distance distributions, and every
 * cell of the order table (including the diagonal).
 */
void CRM::pseudocount(const probability &pc) {

	const unsigned int n_sites = this->size();

	for (unsigned int a = 0; a < n_sites; a++) {

		// order preferences of row `a'
		for (unsigned int b = 0; b < n_sites; b++) {
			this->order(a, b) += pc;
		}

		// distances from `a' to every later site, then from `a' to the sequence end
		for (unsigned int b = a + 1; b < n_sites; b++) {
			this->distance(a, b).pseudocount(pc);
		}
		this->distance(a).pseudocount(pc);

		// the binding site itself
		this->site(a).pseudocount(pc);
	}

}


/**
 * Accumulate `weight' times the parameters of `addend' into this CRM.
 *
 * `addend' describes a subset of my binding sites.  `sitemap' is a bitmap
 * (least significant bit <=> my site 0) saying which of my sites those are:
 * the k-th site of `addend' is added to my site holding the k-th set bit.
 * Bits at or beyond size() are ignored.  That the number of set bits equals
 * addend.size() is only verified when CRM_DEBUG is set.
 */
void CRM::add(const CRM &addend, const probability &weight, unsigned int sitemap) {

	sitemap &= (1 << this->size()) - 1;		// drop bits that do not correspond to one of my sites

	#if CRM_DEBUG
	unsigned int popcount = 0;
	for (unsigned int bits = sitemap; bits != 0; bits >>= 1) { popcount += (bits & 1); }
	if (addend.size() != popcount) {
		cerr << "sitemap = " << sitemap << endl;
		cerr << "ones = " << popcount << endl;
		cerr << "addend.size() = " << addend.size() << endl;
	}
	assert(addend.size() == popcount);
	assert(addend.size() <= this->size());
	#endif

	// mine[k] = index, in this CRM, of the k-th site of `addend'
	vector<unsigned int> mine;
	for (unsigned int bit = 0; bit < this->size(); bit++) {
		if (((sitemap >> bit) & 1) == 0) { continue; }	// site absent from the subset
		mine.push_back(bit);
	}

	const unsigned int n_added = addend.size();

	for (unsigned int a = 0; a < n_added; a++) {

		const unsigned int my_a = mine[a];

		// binding site and its distance to the end of the sequence
		this->site(my_a).add(addend.site(a), weight);
		this->distance(my_a).add(addend.distance(a), weight);

		// distances to every later site of the subset
		for (unsigned int b = a + 1; b < n_added; b++) {
			this->distance(my_a, mine[b]).add(addend.distance(a, b), weight);
		}

		// order preferences against every site of the subset
		for (unsigned int b = 0; b < n_added; b++) {
			this->order(my_a, mine[b]) += weight * addend.order(a, b);
		}
	}

} // CRM::add(...)


/**
 * Guarded subtraction:  store <- (minuend - subtrahend).
 *
 * Returns false when the subtraction would go negative (subtrahend is nonzero
 * and greater than minuend); in that case a tiny positive value (2^-500) is
 * stored instead of a negative number.  Returns true otherwise, with `store'
 * holding the difference.
 *
 * `store' is now written on every path.  Previously it was left untouched
 * when subtrahend == 0, which is only correct if `store' aliases `minuend'
 * (as every caller in this file does); for such callers nothing changes.
 */
bool subtract_zerocheck(probability &store, const probability &minuend, const probability &subtrahend) {

	const probability epsilon = pow(2.0,-500.0);

	if ( subtrahend != 0 ) {
		if ( minuend < subtrahend ) {
			// would fall below zero:  clamp to a tiny positive value and report it
			store = epsilon;
			return false;
		}
		store = ( minuend - subtrahend );
	} else {
		// nothing to subtract:  the result is the minuend itself
		store = minuend;
	}
	return true;
}

// this <- (this - weight * subtrahend), restricted to the sites selected by
//	`sitemap' (bitmap, least significant bit <=> my site 0; the k-th site of
//	`subtrahend' corresponds to my site holding the k-th set bit).
//
// Returns false if some parameter would have fallen below zero.
//
// NOTE: every step is chained as `ans = ans && ...', so evaluation
//	short-circuits:  once one subtraction has failed, NO further parameter is
//	subtracted and the model is left partially updated.  (For parameters
//	handled by subtract_zerocheck the failing one is set to a tiny positive
//	value, not to zero.)
bool CRM::subtract(const CRM &subtrahend, const probability &weight, unsigned int sitemap) {

	sitemap &= (1 << this->size()) - 1;		// limit bitmap ones by size of CRM

	#if CRM_DEBUG
	// the number of set bits must equal the number of sites in the subtrahend
	unsigned int ones = 0; for (unsigned int map=sitemap; map > 0; map = map >> 1) { ones += (map & 1); }
	assert(subtrahend.size() == ones);
	assert(subtrahend.size() <= this->size());
	#endif

	bool ans = true;
	
	vector<unsigned int> map;	// map[subset_crm_site] -> original_crm_site
	for (unsigned int i=0; i<this->size(); i++) { 
		if ((sitemap >> i) & 1) { 
			map.push_back(i); // site i is present
		}
	}


	for (unsigned int si=0; si<subtrahend.size(); si++) { 	// subtrahend index

		const unsigned int myi = map[si];	// my corresponding index
		
		// Subtract binding sites
		#if CRM_DEBUG
		assert(this->site(myi).multiplicity() == subtrahend.site(si).multiplicity());
		#endif

		// strand preferences (in place:  store and minuend are the same parameter)
		ans = ans && subtract_zerocheck( this->site(myi).strand_preference(BindingSite::TMPL) , 
										 this->site(myi).strand_preference(BindingSite::TMPL) , 
									 	 weight * subtrahend.site(si).strand_preference(BindingSite::TMPL) );
		ans = ans && subtract_zerocheck( this->site(myi).strand_preference(BindingSite::TCX) , 
										 this->site(myi).strand_preference(BindingSite::TCX) , 
										 weight * subtrahend.site(si).strand_preference(BindingSite::TCX) );

		for (unsigned int m=0; m < this->site(myi).multiplicity(); m++) {

			// motif preference
			ans = ans && subtract_zerocheck( this->site(myi).motif_preference(m) , 
											 this->site(myi).motif_preference(m) ,
											 weight * subtrahend.site(si).motif_preference(m) );

			#if CRM_DEBUG
			assert(this->site(myi).motif(m).width() == subtrahend.site(si).motif(m).width());
			#endif
		
			// every PWM entry:  position a, alphabet symbol b
			for (unsigned int a=0; a<this->site(myi).motif(m).width(); a++) { 
			for (unsigned int b=0; b<DNA_ABLEN; b++) { 
				ans = ans && subtract_zerocheck( this->site(myi).motif(m)(a,b), 
				                                 this->site(myi).motif(m)(a,b),
												 weight * subtrahend.site(si).motif(m)(a,b) );
			}}
		}


		// distance from this site to the end of the sequence
		ans = ans && this->distance(myi).subtract(subtrahend.distance(si), weight);

		// distances from this site to every later site of the subset
		for (unsigned int sj = si + 1; sj < subtrahend.size(); sj++) { 

			const unsigned int myj = map[sj];

			ans = ans && this->distance(myi,myj).subtract(subtrahend.distance(si,sj), weight); 
			
		}


		// order preferences against every other site of the subset
		for (unsigned int sj=0; sj<subtrahend.size(); sj++) { 
			if (sj == si) { continue; }	// no need to get a zero and start over when order(i,i) is meaningless
			const unsigned int myj = map[sj];
			ans = ans && subtract_zerocheck( this->order(myi,myj), this->order(myi,myj), 
			                         	     weight * subtrahend.order(si,sj) );
		}
		
	} // next site 

	return ans;

}


/*
bool CRM::subtract(const CRM &subtrahend, const probability &weight, unsigned int sitemap) {
	// (See CRM::add for explanation of these variables, skip_negation, sitemap, more...)
	vector<unsigned int> sitemap = create_sitemap(*this, subtrahend);

	bool ans = true;

	for (unsigned int i=0; i<this->size(); i++) {

		if (sitemap[i] >= this->size()) { continue; }

		// Subtract binding sites
		#if CRM_DEBUG
		assert(this->site(i).multiplicity() == subtrahend.site(sitemap[i]).multiplicity());
		#endif

		ans = ans && subtract_zerocheck( this->site(i).strand_preference(BindingSite::TMPL) , 
										 this->site(i).strand_preference(BindingSite::TMPL) , 
									 	 weight * subtrahend.site(sitemap[i]).strand_preference(BindingSite::TMPL) );
		ans = ans && subtract_zerocheck( this->site(i).strand_preference(BindingSite::TCX) , 
										 this->site(i).strand_preference(BindingSite::TCX) , 
										 weight * subtrahend.site(sitemap[i]).strand_preference(BindingSite::TCX) );

		for (unsigned int m=0; m < this->site(i).multiplicity(); m++) {

			ans = ans && subtract_zerocheck( this->site(i).motif_preference(m) , 
											 this->site(i).motif_preference(m) ,
											 weight * subtrahend.site(sitemap[i]).motif_preference(m) );

			#if CRM_DEBUG
			assert(this->site(i).motif(m).width() == subtrahend.site(sitemap[i]).motif(m).width());
			#endif
		
			for (unsigned int a=0; a<this->site(i).motif(m).width(); a++) { 
			for (unsigned int b=0; b<DNA_ABLEN; b++) { 
				ans = ans && subtract_zerocheck( this->site(i).motif(m)(a,b), 
				                                 this->site(i).motif(m)(a,b),
												 weight * subtrahend.site(sitemap[i]).motif(m)(a,b) );
			}}
		}


		ans = ans && this->distance(i).subtract(subtrahend.distance(sitemap[i]), weight);

		for (unsigned int j=i+1; j<this->size(); j++) { 
			if (sitemap[j] >= this->size()) { continue; }
			ans = ans && this->distance(i,j).subtract(subtrahend.distance(sitemap[i],sitemap[j]), weight); 
			
		}


		for (unsigned int j=0; j<this->size(); j++) { 
			if (sitemap[j] >= this->size()) { continue; }
			ans = ans && subtract_zerocheck( this->order(i,j), this->order(i,j), 
			                         	     weight * subtrahend.order(sitemap[i],sitemap[j]) );
		
		}
		
	} // next site 

	return ans;
}*/


/**
 * Append all binding sites of `peer' to this CRM.
 *
 * Pairwise parameters between two of my original sites are kept, those
 * between two of peer's sites are copied from `peer', and every relationship
 * between one of my sites and one of peer's sites is new:  its order
 * preference is set to 1 and (for i < j) its distance distribution is reset
 * and made uniform.
 */
void CRM::cat(const CRM &peer) {

	const unsigned int own = this->size();				// number of sites before appending
	const unsigned int total = own + peer.size();		// number of sites afterwards

	// append peer's binding sites
	this->binding_sites.reserve(total);
	this->binding_sites.insert(this->binding_sites.end(),
							   peer.binding_sites.begin(), peer.binding_sites.end());

	// grow the pairwise table to total x total
	this->pairwise_preferences.resize(total);
	for (unsigned int r = 0; r < total; r++) {
		this->pairwise_preferences[r].resize(total);
	}

	for (unsigned int i = 0; i < total; i++) {
		for (unsigned int j = 0; j < total; j++) {

			const bool i_is_mine = (i < own);
			const bool j_is_mine = (j < own);

			// both sites were already mine:  nothing to do
			if (i_is_mine && j_is_mine) { continue; }

			if (!i_is_mine && !j_is_mine) {
				// both sites came from peer:  copy peer's parameters
				// NOTE(review): distance(i,j) is also assigned here when i > j,
				//	unlike the i < j guard used for new relationships below -- confirm
				//	that the distance(i,j) accessor accepts that.
				if (i == j) {
					this->distance(i) = peer.distance(i - own);
				} else {
					this->distance(i, j) = peer.distance(i - own, j - own);
				}
				this->order(i, j) = peer.order(i - own, j - own);
				continue;
			}

			// one site is mine and one is peer's (so i != j):  a never before seen
			// relationship, make it a uniform distribution
			this->order(i, j) = 1;
			if (i < j) {
				reset(this->distance(i, j));
				this->distance(i, j).fill(1);
				this->distance(i, j).normalize();
			}
		}
	} // next pair

}


		


void CRM::normalize() { 

	for (unsigned int i=0; i<this->size(); i++) { 

		this->site(i).normalize();
		this->distance(i).normalize();

		for (unsigned int j=i+1; j<this->size(); j++) { 
			this->distance(i,j).normalize();
		}

		// normalizing order is necessary for adding normalized models
		for (unsigned int j=0; j<this->size(); j++) { 
			probability sum = 0;
			sum += this->order(i,j);
			sum += this->order(j,i);
			if (sum > 0) { 
				this->order(i,j) /= sum;
				this->order(j,i) /= sum;
			}
		}

	}
	#if CRM_DEBUG
	assert(this->check());
	#endif

}

	
		

/**
 * Remove the binding site at position `index', together with its row and
 * its column in the pairwise preference table.
 */
void CRM::remove_site(unsigned int index) {

	// the site itself
	binding_sites.erase( binding_sites.begin() + index );

	// its column, in every row
	const unsigned int n_rows = pairwise_preferences.size();
	for (unsigned int r = 0; r < n_rows; r++) {
		pairwise_preferences[r].erase( pairwise_preferences[r].begin() + index );
	}

	// its row
	pairwise_preferences.erase( pairwise_preferences.begin() + index );

	#if CRM_DEBUG
	assert(this->check());
	#endif

}


vector<CRM> CRM::powerset() const { 

	vector<CRM> ans;

	for (unsigned int sitemap = 0; sitemap < ((unsigned int)(1 << this->size())); sitemap++) { 
		// sitemap from all zeros to all ones
		ans.push_back(this->subset(sitemap));	// note that vector index == site map
	}

	return ans;
}

/**
 * Build a CRM made of a subset of this CRM's binding sites, carrying over
 * the pairwise parameters between the selected sites.
 *
 * sitemap:  bitmap of the sites to keep (least significant bit <=> site
 *	index 0).  Bits at or beyond size() are ignored.  Selected sites keep
 *	their relative order.
 */
CRM CRM::subset(unsigned int sitemap) const {

	// kept[k] = index, in this CRM, of the k-th site of the subset
	vector<unsigned int> kept;
	for (unsigned int bit = 0; bit < this->size(); bit++) {
		if (((sitemap >> bit) & 1) != 0) { kept.push_back(bit); }
	}

	CRM picked;		// starts out empty

	// binding sites
	for (unsigned int k = 0; k < kept.size(); k++) {
		picked.binding_sites.push_back( this->binding_sites[ kept[k] ] );
	}

	// pairwise parameters (order and distance) between the kept sites
	const unsigned int n_picked = picked.size();
	picked.pairwise_preferences.resize(n_picked);

	for (unsigned int r = 0; r < n_picked; r++) {

		picked.pairwise_preferences[r].resize(n_picked);

		for (unsigned int c = 0; c < n_picked; c++) {
			picked.pairwise_preferences[r][c] =
				this->pairwise_preferences[ kept[r] ][ kept[c] ];
		}
	}

	return picked;
}



// DEBUGGING function:  Check integrity of CRM data structure
//
// Returns true iff
//	- the pairwise table has one row per binding site            (else "C#1"),
//	- every row has one column per binding site                  (else "C#2"),
//	- every kept distance (column j >= row i) has at least
//	  CRM::MAXL+1 entries                                        (else "C#3"),
//	- every kept distance uses bin width CRM::BINW               (else "C#4").
// A diagnostic code is written to cerr for every violation found.
//
// Rows/columns that do not exist are reported (C#1/C#2) but never indexed, so
// the check itself cannot read out of bounds on a corrupt structure.
#if CRM_DEBUG
bool CRM::check() const {
	
	bool ans = true;

	const unsigned int sites = this->size();
	const unsigned int rows = this->pairwise_preferences.size();

	if ( sites != rows ) { cerr << "C#1\n"; ans = false; }

	// only visit rows that are both expected and present
	const unsigned int checked_rows = (rows < sites) ? rows : sites;

	for (unsigned int i=0; i<checked_rows; i++) { 

		const unsigned int cols = this->pairwise_preferences[i].size();
		if (cols != sites) { cerr << "C#2\n"; ans = false; }

		// only visit columns that are both expected and present
		const unsigned int checked_cols = (cols < sites) ? cols : sites;
		
		for (unsigned int j=i; j<checked_cols; j++) { 	
			// check distances (kept only for j >= i), make sure size is right;
			
			if (this->pairwise_preferences[i][j].second.size() < CRM::MAXL+1) { 
				cerr << "C#3 (CRM::MAXL+1 = " << (CRM::MAXL+1) 
					 << ", size=" << (this->pairwise_preferences[i][j].second.size()) 
					 << ")\n";
				ans = false; 
			}
			
			if (this->pairwise_preferences[i][j].second.binw() != CRM::BINW) {
				cerr << "C#4\n"; 
				ans = false; 
			}
			
		} // next distance pair i,j
	} // next B.S.
	return ans;

}
#endif



// Value (density) of a Gamma distribution with shape `alpha' and scale `beta'
// at `x':
//
//   f(x) = lambda^alpha / Gamma(alpha) * x^(alpha-1) * exp(-lambda * x),   lambda = 1/beta
//
// Returns 0 for x < 1e-13.  The density is evaluated in log space;
// ln Gamma(alpha) comes from a Lanczos series for alpha < 700 and from
// Stirling's formula above that.
//
// Fix:  the Stirling branch used (alpha + 0.5) * ln(alpha), which is
// ln Gamma(alpha + 1); ln Gamma(alpha) needs (alpha - 0.5) * ln(alpha).  The
// old code therefore returned a density too small by a factor of alpha
// whenever alpha >= 700.
double gamma(const double alpha, const double beta, const double x) { 

	// NOTE: this macro stays defined for the rest of the translation unit (as before)
	#ifndef PI
	#define PI 3.14159265358979323851280895940618620443274267017841339111328125
	#endif
	static const double DELTA = 1.0E-13;
	static const double LANCZ_CUTOFF =   700.0;

	// the density is taken to be zero at (and below) the origin
	if (x < DELTA) { return 0.0; }

	const double lambda = 1.0 / beta;

	double log_gamma_alpha;		// ln Gamma(alpha)

	if (alpha < LANCZ_CUTOFF) {
		// Lanczos approximation of ln Gamma(alpha)
		double tmp = alpha + 5.5;
		tmp = tmp - (alpha + .5) * logl(tmp);
		double ser = 1.000000000190015 + 76.18009172947146 / ( alpha + 1.0 );
		ser -= 86.50532032941678 / ( alpha + 2.0 );
		ser += 24.01409824083091 / ( alpha + 3.0 );
		ser -= 1.231739572450155 / ( alpha + 4.0 );
		ser += .001208650973866179 / ( alpha + 5.0 );
		ser -= 5.395239384953E-06 / ( alpha + 6.0 );
		log_gamma_alpha = (logl(2.506628274631001 * ser / alpha) - tmp);
	} else {
		// Stirling:  ln Gamma(a) ~ 0.5 ln(2 pi) + (a - 0.5) ln(a) - a
		log_gamma_alpha = ( 0.5 * logl(2.0 * PI) + (alpha - 0.5) * logl(alpha) - alpha );
	}

	// ln f(x) = alpha ln(lambda) - ln Gamma(alpha) + (alpha - 1) ln(x) - lambda x
	return exp( alpha * log(lambda) - log_gamma_alpha + (alpha - 1.0) * log(x) - lambda * x );
}




// easiest to assume we're adding at END of list (since order doesn't matter)
void CRM::add_site(const BindingSite &new_site) { 

	binding_sites.push_back(new_site);
	
	pairwise_preferences.resize(this->size());
	for (unsigned int i=0; i<this->size(); i++) { 
		pairwise_preferences[i].resize(this->size());
	}

	Distance *distance;

	// set initial distance distribution (new site -> end)
	distance = &(this->distance(this->size()-1));	// shorthand
	reset(*distance);	// (resize)
	if (CRM::GammaBeta > 0) {
		// we want to use a Gamma distribution to initialize this 
		distance->fill(0);
		for (unsigned int x=0; x<distance->size(); x++) { 
			distance->add(x, gamma(2.0, CRM::GammaBeta, x));
		}
	} else {
		distance->fill(1);
	}
	distance->normalize();
	distance = NULL;
	
	for (unsigned int i=0; i<this->size()-1; i++) { 	// for every *other* binding site (not the last one)
		
		this->order(i,this->size()-1) = this->order(this->size()-1,i) = 0.5;
		
		// set initial distance distribution (new site <-> other site)
		distance = &(this->distance(i,this->size()-1));	// shorthand
		reset(*distance);	// (resize)
		if (CRM::GammaBeta > 0) {
			distance->fill(0);
			for (unsigned int x=0; x<distance->size(); x++) { 
				distance->add(x, gamma(2.0, CRM::GammaBeta, x));
			}
		} else {
			distance->fill(1);
		}
		distance->normalize();
		distance = NULL;
	}


	#if CRM_DEBUG
	assert(this->check());
	#endif

}
		
		
void CRM::add_site(const PWM<probability> &pwm) {

	BindingSite new_site;

	new_site.insert(0, pwm);
	
	new_site.motif_preference(0) = (probability)1;

	new_site.strand_preference(BindingSite::TMPL) 
	= new_site.strand_preference(BindingSite::TCX) 
	= ((probability)(1.0 / BindingSite::NUM_STRANDS));

	this->add_site(new_site);

	#if CRM_DEBUG
	assert(this->check());
	#endif


}



/**
 * Add `pwm' as an additional (last) motif of binding site `index', then
 * re-normalize that binding site.
 */
void CRM::add_motif(unsigned int index, const PWM<probability> &pwm) {

	const unsigned int append_at = site(index).multiplicity();	// position after the last motif

	site(index).insert(append_at, pwm);
	site(index).normalize();

	#if CRM_DEBUG
	assert(this->check());
	#endif
}



/** Return the number of binding sites whose `negated' flag is set. */
unsigned int CRM::negated() const {

	unsigned int n_negated = 0;

	for (unsigned int k = 0; k < this->binding_sites.size(); k++) {
		if (this->binding_sites[k].negated) { n_negated++; }
	}

	return n_negated;
}



/**
 * Enumerate all n! orderings (permutations) of the indices 0..n-1.
 *
 * n == 0 yields an empty list (no orderings at all, not one empty ordering).
 * For n > 1 every ordering of n-1 elements is extended in n ways:  value `j'
 * is appended, and the existing values >= j are shifted up by one to make
 * room for it.  New orderings are put at the FRONT of the result.
 */
list<vector<unsigned int> > enum_order(unsigned int n)  {

	list<vector<unsigned int> > result;

	// no elements:  no orderings
	if (n == 0) { return result; }

	// one element:  the single ordering (0); larger n build on this
	if (n == 1) {
		result.push_back(vector<unsigned int>(1, 0u));
		return result;
	}

	/*
	//// NO ORDERING:  only first order, 0,1,...SITES-1
	result.push_back(vector<unsigned int>(n));
	for (unsigned int i=0; i<n; i++) { result.back()[i] = i; }
	*/

	const list<vector<unsigned int> > shorter = enum_order(n-1);

	for (list<vector<unsigned int> >::const_iterator base = shorter.begin(); base != shorter.end(); ++base) {
		for (unsigned int appended = 0; appended < n; appended++) {

			vector<unsigned int> extended;
			extended.reserve(n);

			// make room for `appended' by shifting up every value at or above it
			for (unsigned int k = 0; k < base->size(); k++) {
				const unsigned int v = (*base)[k];
				extended.push_back( (v >= appended) ? (v + 1) : v );
			}
			extended.push_back(appended);

			result.push_front(extended);
		}
	}

	return result;

}



/** 
 *	Return a list of orders and prior order probabilities, based on normalizing the product of 
 *	all partial orders 
 */
list<pair<vector<unsigned int>, probability> > order_priors(const CRM &crm, const list<vector<unsigned int> > &orders) { 

	list<pair<vector<unsigned int>, probability> > ans;
	probability sum = 0;
	for (list<vector<unsigned int> >::const_iterator itr=orders.begin(); itr!=orders.end(); itr++) { 

		const vector<unsigned int> *order = &(*itr);
		probability P = 1;
		for (unsigned int i=0; i<crm.size(); i++) { 
		for (unsigned int j=i+1; j<crm.size(); j++) { 

			#if CRM_DEBUG
			assert(!isnan(crm.order( (*order)[i], (*order)[j] )));
			#endif
		
			P *= crm.order( (*order)[i], (*order)[j] );

		}}
		sum += P;
		ans.push_back(pair<vector<unsigned int>,probability>(*itr, P));
	}

	for (list<pair<vector<unsigned int>, probability> >::iterator itr=ans.begin(); itr!=ans.end(); itr++) { 

		itr->second /= sum;

	}

	// alter the list
	constrain_order(ans, crm);

	return ans;
}
			





/** Add likelihood for all values of j */
struct inside_calculator { 
  
  public:
  	
	vector<DPCell> *alpha;
  	
	inside_calculator(vector<DPCell> *p = NULL) {
	
		alpha = p;
	}
	
  	void operator()(unsigned int i, unsigned int j, unsigned int m_IGNORED, unsigned int r_IGNORED, 
	                	const probability &v_IGNORED, const probability &z) { 

		#if CRM_DEBUG
		assert(alpha);
		const unsigned int L = alpha->size() - 1;
		assert(i <= L);
		if (isnan(z) || !((z <= ((probability)1)))) { cerr << "z = " << z << endl; }
		assert(!isnan(z));
		assert(z <= ((probability)1));
		#endif

		((*alpha)[i]).q += z;	// update `i' because this is an inside calculation

		// update traceback pointer to maximum-likelihood 'j' value.
		if ( z > ((*alpha)[i]).z ) { 
			((*alpha)[i]).z = z;
			((*alpha)[i]).traceback = j;
		}

	}

}; // inside calculator





/**
 * Print a CRM Location:  one tab-terminated "site@position" entry for every
 * element of location.map.
 */
ostream& operator<<(ostream &out, const CRMLocation &location) {
	// NOTE:
	//	binding site index and sequence position are stored zero-origin,
	//	but both are written out here as 1-origin numbers.
	list<pair<unsigned int, unsigned int> >::const_iterator entry = location.map.begin();
	while (entry != location.map.end()) {
		out << (1+entry->first) << "@" << (1+entry->second) << "\t";
		entry++;
	}
	return out;
}


/**
 * Calculate the likelihood of each of a list of sequences given the CRM model
 *
 * For every example the likelihood is the sum, over all site orderings with
 * nonzero prior, of (order prior) * P(sequence | ordering).  An empty CRM has
 * no orderings; its likelihood is the background likelihood of the whole
 * sequence.
 *
 *	crm:        the model
 *	examples:   the sequences; one likelihood is returned per example, in order
 *	locations:  optional (may be NULL).  If given, it is cleared and receives
 *	            one CRMLocation per example, holding the maximum-likelihood
 *	            site positions of the ordering with the largest likelihood.
 *
 * Fix:  the likelihood stored in the example's CRMLocation is now updated
 * whenever a better ordering is traced back.  Previously it never changed, so
 * the "better than the best so far" test accepted every ordering and the
 * reported location belonged to the last ordering evaluated, not the best.
 */
vector<probability> likelihood(const CRM &crm, 
								const vector<Example*> &examples, 
								vector<CRMLocation> *locations) {


	if (locations) { locations->clear(); }
		
	const DPCell blank_DPCell(0,0,0);

	// alpha[i] = inside matrix after placing the i+1 most downstream sites
	vector<vector<DPCell> > alpha(crm.size());
	
	const list<vector<unsigned int> > orders = enum_order(crm.size());
	const list<pair<vector<unsigned int>, probability> > priors = order_priors(crm, orders);

	vector<probability> ans;

	for (vector<Example*>::const_iterator x=examples.begin(); x!=examples.end(); x++) {
		
		const Example *example = *x;

		const unsigned int L = example->sequence()->length();

		// resize all matrices to appropriate size
		for (vector<vector<DPCell> >::iterator a_itr=alpha.begin(); a_itr!=alpha.end(); a_itr++) { 
			a_itr -> resize(L+1, blank_DPCell);
		}
		
		probability example_likelihood = 0;	// likelihood of *example = sum of P(sequence|order)

		if (locations) { locations->push_back(CRMLocation(0)); }	// append empty location

		// special case:  if CRM is empty (hence zero orderings), path likelihood is given by background
		if (!crm.size()) { example_likelihood = example->bg(0,L); }

		for (list<pair<vector<unsigned int>, probability> >::const_iterator itr=priors.begin();
																  itr!=priors.end(); itr++) {
				
			if (itr->second == 0) { continue; }	// save (a lot of?) time if ordering is impossible
			
			const vector<unsigned int> *order = &(itr->first);
			const probability order_prior = itr->second;
		
			// inside pass:  place the sites one by one, most downstream site first
			for (unsigned int i=0; i<crm.size(); i++) {

				vector<DPCell> *alpha_in = (i==0) ? NULL : &(alpha[i-1]);
				vector<DPCell> *alpha_out = &(alpha[i]);
			
				const unsigned int this_bs_index = (*order)[crm.size() - 1 - i];
				const unsigned int downstream_bs_index = (i==0) ? ((unsigned int)-1) : (*order)[crm.size() - i];

				std::fill(alpha_out->begin(), alpha_out->end(), blank_DPCell);
				inside_calculator IC(alpha_out);

				inside_outside(&(crm.site(this_bs_index)), 		// BindingSite
							   example, 						// example
							   alpha_in, 						// inside
							   NULL,							// outside 
							   (i==0) ? &(crm.distance(this_bs_index)) : &(crm.distance(this_bs_index,downstream_bs_index)),
							   false,	//allow overlap
							   (crm.size() == 1 ? CRM::magic_ratio1 : CRM::magic_ratio2),
							   IC);

			}

			vector<DPCell> *alpha_final = &( alpha.back() );

			// I've calculated the probability of sequence[j..L) for all j (given the order)
			//	Now, multiply each by likelihood of sequence[0..j), and sum.
			probability P = 0;
			for (unsigned int j=0; j<=L; j++) { 
				P += example->bg(0,j) * ((*alpha_final)[j].q);
			}


			probability order_likelihood = P * order_prior;	// `order_prior' is the contribution of this ordering


			// TRACEBACK STEP:  only for an ordering that beats the best one seen so far
			if (locations && (order_likelihood > locations->back().likelihood)) { 

				// scan alpha[lastsite] for ML
				probability best_seqprob = 0;
				unsigned int best_i = 0;

				for (unsigned int i=0; i<=L; i++) { 
					probability seqprob = example->bg(0,i) * (*alpha_final)[i].z; 	// background [0...i) * model [i...L) = total sequence likelihood.
					if (seqprob > best_seqprob) { 
						best_seqprob = seqprob;
						best_i = i;
					}
				}

				// traceback
				locations->back().map.clear();
				unsigned int j = best_i;	// start of traceback
				for (unsigned int order_index=0; order_index<crm.size(); order_index++) { 
					unsigned int site_index = (itr->first)[order_index];
					locations->back().map.push_back(pair<unsigned int,unsigned int>(site_index, j));
					j = alpha[crm.size()-1-order_index][j].traceback;	// trace back yet another level back
				}

				// remember how good this ordering was, so that only a better one replaces it
				locations->back().likelihood = order_likelihood;
			}
		
			example_likelihood += order_likelihood;	// add contribution of this ordering to likelihood of this example

		} // next order

		ans.push_back(example_likelihood);

	} // next example

	return ans;

}





/**
 * For each sequence, compute the likelihood of the one "positive" path
 * (the subset of `crm' accepted by CRM::is_positive_path) divided by the sum
 * of the likelihoods of all paths, where the paths are the members of the
 * power set of `crm'.
 *
 * If `locations' is non-NULL it receives the site locations found for the
 * positive path (see likelihood()).
 */

vector<probability> probabilityof(const CRM &crm, 
									const vector<Example*> &examples, 
									vector<CRMLocation> *locations) {

	const unsigned int NO_PATH = (unsigned int)-1;

	vector<CRM> paths = crm.powerset();
	vector<vector<probability> > lhood_by_path(paths.size());

	unsigned int positive_path = NO_PATH;

	for (unsigned int p = 0; p < paths.size(); p++) {

		// locations are only collected for the positive path
		vector<CRMLocation> *wanted_locations = NULL;

		if (CRM::is_positive_path(paths[p], crm)) {
			#if CRM_DEBUG
			assert(positive_path == NO_PATH);	// one and only one positive path
			#endif
			positive_path = p;
			wanted_locations = locations;
		}

		lhood_by_path[p] = likelihood(paths[p], examples, wanted_locations);

	} // next CRM in power set

	#if CRM_DEBUG
	assert(positive_path < paths.size());	// a positive path must have been found
	#endif

	vector<probability> posterior;
	posterior.reserve(examples.size());

	for (unsigned int x = 0; x < examples.size(); x++) {

		// denominator:  likelihood of the sequence summed over all paths
		probability all_paths = 0;
		for (unsigned int p = 0; p < lhood_by_path.size(); p++) {
			all_paths += lhood_by_path[p][x];
		}

		posterior.push_back( lhood_by_path[positive_path][x] / all_paths );
	}

	return posterior;
}



/** 
 *
 * Outside Calculator has two jobs: 1) Update an outside alpha matrix for each
 * [0..j), with a probability 2) Keep a probability, Z = P(i,j,m,r) for all
 * motif positions (i), previous CRM locations (j), motif-choice (m), strand
 * (r) and, when done, normalize these and apply to part of a learned model
 * (i.e., PWMs, motif choices--which m of M motif motifs, strand
 * preferences, and a distance model)
 *
 * The calculator does not own any of the objects it points to; it adds
 * weighted counts to `site' and `distance' and to the `alpha' matrix.
 *
 * Fix:  under CRM_DEBUG the preconditions (alpha non-NULL, i and j in range,
 * ...) are now asserted BEFORE (*alpha)[j] is written.  They used to run
 * after the write, too late to catch the error they describe.
 */
struct outside_calculator {

  public:
 	
	const string *sequence;	// sequence relation
	vector<DPCell> *alpha;	// (outside) alpha matrix
	Distance *distance;		// distance model that receives the counts
	BindingSite *site;		// binding site that receives the counts
	const probability prior;	// weight applied to every update

  	outside_calculator(const string *s = NULL, vector<DPCell> *vpp = NULL, 
	    BindingSite *bs = NULL, Distance *d = NULL, probability p=1) 
			: sequence(s), alpha(vpp), distance(d), site(bs), prior(p) { }

	// update this object during an inside-outside calculation (see
	// BindingSite::inside_outside in CRM.h) i=position of first base in motif,
	// j=position of first nt in downstream CRM, m = motif ID, r=strand ID, 
	// v = value added to the outside alpha matrix at `j', z = prob. of
	// sequence given these settings
  	void operator()(unsigned int i, unsigned int j, unsigned int m, unsigned int r, const probability &v, const probability &z) {

		#if CRM_DEBUG
		// preconditions:  checked before anything is dereferenced or indexed
		assert(alpha);
		const unsigned int L = alpha->size() - 1;
		assert(i <= L && j <= L);
		assert(r < BindingSite::NUM_STRANDS);
		assert(m < site->multiplicity());
		assert(z <= ((probability)1) && !isnan(z));
		assert(v <= ((probability)1) && !isnan(v));
		#endif

		((*alpha)[j]).q += v;		// update outside alpha matrix (`j' because it's outside)

		#if CRM_DEBUG
		assert(!isnan(((*alpha)[j]).q));
		#endif


		const probability update = z * prior;

		// accumulate the weighted observation into the learned parameters
		if (update > (probability)0) {

			const unsigned int w = site->motif(m).width();

			site->motif(m).add(*sequence, update, r==BindingSite::TCX, i, i+w);
			site->motif_preference(m) += update;
			site->strand_preference(r) += update;

			distance->add(j-i, update);

		}

	} // operator()(i,j,m,r,z)
			
};	// outside_calculator


/** 
 *
 * Run one EM iteration on each sequence (examples).
 *
 * Parameters:
 * 	crm:             current parameters (read only)
 *	examples:        set of training data
 *	used:            (output, may be NULL) parameters used to explain the
 *	                 sequences (unnormalized through the upper paths); it is
 *	                 reset to an all-zero copy of crm's structure and then
 *	                 receives the raw counts of every sequence
 *	normalized_sum:  (output, may be NULL) reset likewise; the counts of each
 *	                 sequence are normalized and then added with the
 *	                 sequence's weight
 *	positive:        if true, use the example's weight to calculate
 *	                 normalized_sum, otherwise, use (1.0 - weight).  i.e.
 *	                 setting positive=false reverses example class labels.
 *
 * RETURN:  the sum of sequence probabilities, which equals `used' CRM
 *          parameters' normalizing constant.
 *
 */
probability EM(const CRM &crm, const vector<Example*> &examples, CRM *used, CRM *normalized_sum, bool positive) {

	//	Let each site be indexed 0 (most upstream), 1, 2, ... SITES-1 (most
	//	downstream)
	//
	//  The inside alpha matrix (a 1-D array for each site), inside[site][j] =
	//  P ( sequence[j..L) | `site' begins at `j', sites site-1, .. 0 appear
	//  somewhere further downstream )
	//
	//	The outside alpha matrix, outside[site][j] = P ( sequence[0..j) |
	//	`site' appears upstream of `j' (as does site+1, site+2, ...
	//	CRM::size()-1), `site' is followed by background up to, but not
	//	including position `j', where site-1 is free to appear.
	//
	//	inside alpha array (sometimes I'll call it matrix, but it's only 1-D):
	//	inside[j] represents the likelihood of the sequence data starting at
	//	'j', through the end of the sequence, assuming the CRM starts at
	//	position 'j' (i.e. the most downstream motif starts there).  
	//
	//	Note that outside[CRM::size()] represents P( sequence[0..i) |
	//	background ), where `i' is assumed to be the position of the last (most
	//	upstream) motif in the CRM, basically the sequence probability upstream
	//	of the CRM.
	// 
	//	To do E-M:
	//
	//	1) For each possible ordering, and for sites, 0, 1, ..., calculate the
	//	corresponding inside_alpha matrix (using the inside_alpha matrix from
	//	the previous iteration) and keep it in memory.  This uses an
	//	inside_calculator object.  When BindingSite::inside_outside calls
	//	inside_calculator::operator(), it just adds to the next iteration's
	//	inside_alpha matrix.
	//
	//	2) Calculate the probability of the sequence for each ordering and
	//	update the P(order) parameters
	//	
	//	3) For each ordering, and for sites CRM::size()-1 to 0, counting down, calculate the next
	//	iteration's outside_alpha matrix given the outside_alpha matrix from
	//	the previous iteration and the inside_alpha matrix corresponding to the
	//	site downstream of this one.  For each position/motif-choice/strand,
	//	the BindingSite::inside_outside function will calculate the probability
	//	and call outside_calculator(), which will i) add to the outside_alpha
	//	matrix for the next iteration, and ii) update the `trainee' CRM model
	//

    //    site        outside alpha                 inside alpha
    //
    //                                           0 1 2 ...       L  (alpha matrix indices)
    //              _ _ _ _ _ _ _ _ _            _ _ _ _ _ _ _ _ _  
    //      0      |_|_|_|_|_|_|_|_|_|   <--    |_|_|_|_|_|_|_|_|_|
    //                      ^                              |  inside_matrices[0]
    //                      |                              v
    //              _ _ _ _ _ _ _ _ _            _ _ _ _ _ _ _ _ _  
    //      1      |_|_|_|_|_|_|_|_|_|   <--    |_|_|_|_|_|_|_|_|_|
    //  outside_matrices[1] ^                              |
    //                      |                              v
    //              _ _ _ _ _ _ _ _ _            _ _ _ _ _ _ _ _ _  
    //      2      |_|_|_|_|_|_|_|_|_|   <--    |_|_|_|_|_|_|_|_|_|
    //                      ^                              
    //                      |                              
    //              _ _ _ _ _ _ _ _ _          
    //             |x|y|z|.|.|.|_|_|_|    
    //                      outside_matrices[NUM_SITES=3]
    //                                                                                   
    //              

	CRM seq_trainee = crm.structure(0);	// copy structure, fill to zero;
										//	`seq_trainee' collects the counts of ONE
										//	sequence at a time, so that they can be
										//	normalized per sequence for `normalized_sum'
	
	if (used) { *used = seq_trainee; }	// copy that (just because it's faster)
	if (normalized_sum) { *normalized_sum = seq_trainee; }

	CRM *const trainee = &seq_trainee;	// because of earlier versions, the per-sequence model is reached through `trainee'

	const unsigned int S = examples.size();
	const unsigned int N = crm.size();	// number of binding sites in the model

	vector<vector<DPCell> > inside(N+1);
	vector<vector<DPCell> > outside(N+1);

	DPCell blank_DPCell(0,0,0);

	// All possible orderings and their priors.  Both are computed from `crm'
	// alone, which does not change from one example to the next, so they are
	// built once instead of once per sequence.
	const list<vector<unsigned int> > orders = enum_order(crm.size());
	const list<pair<vector<unsigned int>, probability> > priors = order_priors(crm, orders);

	probability ans = 0;	// noto-power-change

	for (unsigned int s=0; s<S; s++) {

		const Example *example = examples[s];	// shorthand

		seq_trainee.fill(0);		// blank the parameters, I'll add to it for this sequence
		
		const unsigned int L = example->sequence()->length();

		// resize alpha matrices (resize is better since sequences are often the same length)
		for(unsigned int i=0; i<N; i++) { 
			if (inside[i].size() != L+1) { inside[i].resize(L+1, blank_DPCell); }
			if (outside[i].size() != L+1) { outside[i].resize(L+1, blank_DPCell); }
		}

		// outside[N] is pure background upstream of the CRM; it is the same for every ordering
		outside[N].resize(L+1);
		for (unsigned int i=0; i<=L; i++) {
			outside[N][i].q = example->bg(0,i);
		}

		probability Ps = 0;	// sequence probability

		if (N == 0) {

			// special case:  if CRM is zero, sequence probability is all
			// background.  The orderings are deliberately NOT visited:  with
			// no sites, `N-1' would wrap around (unsigned) and the loops
			// below would index far outside of `order'.
			Ps = example->bg(0, L);

		} else {

			// for all possible orderings
			for (list<pair<vector<unsigned int>, probability> >::const_iterator itr=priors.begin(); 
			                                                      itr!=priors.end(); ++itr) {

				if (itr->second == 0) { continue; }	// save (a lot?) of time if ordering is impossible (?)

				const vector<unsigned int> *order = &(itr->first);
				const probability order_prior = itr->second;

				// Inside pass:  sites 0 .. N-2.  (inside[N-1] is never read by
				// the outside pass below, so it is not computed.)
				for (unsigned int i=0; i+1<N; i++) {

					// 2006-07-14, discrepancy in the interpretation of "order."
					//	order is always (most upstream = left -> most downstream = right)
					//	However, we run inside_outside from downstream->upstream.
					const unsigned int this_bs_index = (*order)[N - 1 - i];
					const unsigned int downstream_bs_index = (i==0) ? ((unsigned int)-1) : (*order)[N - i];

					std::fill(inside[i].begin(), inside[i].end(), blank_DPCell);
					inside_calculator IC(&(inside[i]));

					// call inside/outside
					inside_outside(
						&(crm.site(this_bs_index)),
					    example,
						i==0 ? NULL : &(inside[i-1]),
						NULL,	// no outside alpha matrix
						(i==0) ? &(crm.distance(this_bs_index)) : &(crm.distance(this_bs_index, downstream_bs_index)),
						false,	// false <=> no overlap
						(crm.size() == 1 ? CRM::magic_ratio1 : CRM::magic_ratio2),
						IC);

				} // next site

				// Now, run inside-outside for each site.  We've already set up the "inside"
				// alpha matrices and we'll be setting up the "outside" matrices as we
				// go through these, so start with the last site (size()-1), and move
				// up.  `i' is unsigned:  decrementing past 0 wraps around to a huge
				// value, which is what ends the loop.

				for (unsigned int i=N-1; i<N; i--) {

					// 2006-07-14, discrepancy in the interpretation of "order."
					//	order is always (most upstream = left -> most downstream = right)
					//	However, we run inside_outside from downstream->upstream.
					const unsigned int this_bs_index = (*order)[N - 1 - i];
					const unsigned int downstream_bs_index = (i==0) ? ((unsigned int)-1) : (*order)[N - i];

					std::fill(outside[i].begin(), outside[i].end(), blank_DPCell);

					// the calculator fills outside[i] and adds this ordering's
					// (prior-weighted) counts to the per-sequence model
					outside_calculator OC( example->sequence(), &(outside[i]), 
					                       &(trainee->site(this_bs_index)),
										   (i==0) 
										   		? &(trainee->distance(this_bs_index)) 
												: &(trainee->distance(this_bs_index, downstream_bs_index)) ,
										   order_prior);

					inside_outside(

						&( crm.site(this_bs_index) ),
						example, 
						
						i==0 ? NULL : &(inside[i-1]),
						&(outside[i+1]),	// outside[crm.size()] already initialized, will not change per order
						
						(i==0) ? &(crm.distance(this_bs_index)) : &(crm.distance(this_bs_index, downstream_bs_index)),
						
						false,	// false <=> no overlap b/c I'm learning PWMs
						(crm.size() == 1 ? CRM::magic_ratio1 : CRM::magic_ratio2),
						OC);
						
				} // next site


				// I may now update the next generation's ordering probabilities:
				// every pair (A upstream of B) in this ordering is credited with
				// the prior-weighted sequence probability under this ordering
				if (N > 1) { 
					for (unsigned int A=0; A<N; A++) { 
					for (unsigned int B=A+1; B<N; B++) { 
						trainee->order( (*order)[A] , (*order)[B] ) += (outside[0][L]).q * order_prior;
					}}
				}	

				Ps += (outside[0][L]).q * order_prior;	//sequence probability

			} // next order

		} // zero sites / at least one site


		if (used) { used->add(seq_trainee); }	// raw (unnormalized) counts

		const probability weight = positive ? example->weight() : ((probability)1 - example->weight());

		if (normalized_sum) { 
			seq_trainee.normalize();
			normalized_sum->add(seq_trainee, weight);	// add extent to which an APPROPRIATE example uses this path
		}

		ans += Ps;	// return value:  sum of sequence probabilities, which
							//	equals `used' CRM parameters' normalizing constant

	} // next example sequence, s

	return ans;

} // EM



