#ifndef CRM_H
#define CRM_H

#define CRM_DEBUG 0

#if CRM_DEBUG
#include <assert.h>
#endif

#include "genomic.h"
#include "PWM.h"
#include "logscale.h"
#include "markov.h"
#include "cvector.h"
#include "probability.h"	// probability type defined here (for CRM and Distance)
#include "Distance.h"

#include <list>
#include <vector>
#include <string>
#include <sstream>
#include <iostream>
#include <algorithm>
using namespace std;

// Forward declarations of the types defined later in this header.
class Example;			// CRM learning example:  sequence data + background probabilities
class BindingSite;		// one binding site:  a set of alternative motifs (PWMs) plus strand preferences
struct DPCell;			// Dynamic programming cell
class CRM;				// cis-regulatory module:  binding sites plus pairwise order/distance preferences
struct CRMLocation;		// traceback data:  a likelihood plus {site index, sequence position} pairs

// Stream output for CRM and CRMLocation (only declared here; not defined in this header).
ostream& operator<<(ostream &out, const CRM&);
ostream& operator<<(ostream &out, const CRMLocation&);

/** 
 *	Return a list of the n! vectors of numbers, 0 through n-1, each vector
 *	representing a unique ordering (i.e. every permutation of 0..n-1).
 */
list<vector<uint> > enum_order(uint n);

/** 
 *	Return a list of orders paired with their prior order probabilities, based on
 *	normalizing the product of all partial orders.
 *	NOTE(review):  presumably the partial-order terms come from CRM::order(a,b) --
 *	confirm against the implementation.
 */
list<pair<vector<uint>, probability> > order_priors(const CRM &crm, const list<vector<uint> > &orders);


/**
 * Calculate P(sequence|CRM).
 *
 *	locations:  optional (defaults to NULL).  Presumably filled with the traceback
 *				of each example when non-NULL -- TODO confirm against the implementation.
 *	return:  a vector of probabilities (presumably one per example -- TODO confirm)
 */
vector<probability> likelihood(const CRM &crm, const vector<Example*> &examples, vector<CRMLocation> *locations=NULL);

/** 
 * Calculate probability that crm generated example instead of background.
 *
 *	Takes the same arguments as likelihood(); `locations' is again optional (NULL by default).
 */
vector<probability> probabilityof(const CRM &crm, const vector<Example*> &examples, vector<CRMLocation> *locations=NULL);

/**
 * Run the E-M algorithm on `crm', and return the result.
 *
 *	used:  parameters used by all sequences
 *	normalized:  sum of parameters used in positive sequences, normalized over each sequence
 *						(i.e. values will sum to number of positive sequences, not one)
 *
 *  positive:  set to false if you want `normalized' to normalize param's used over negative sequences
 *
 *	return:  sum over sequences of P(sequence | CRM) = CRM normalizing constant
 *
 **/
probability EM(const CRM &crm, const vector<Example*> &examples, CRM *used, CRM *normalized, bool positive);



/** 
 * Skeleton for the inside or inside-outside calculation.
 * This is only the forward declaration; the template itself is defined
 * near the end of this header.
 */
template <typename _Functor>
void inside_outside(const BindingSite *bs, const Example *example, 
					const vector<DPCell> *inside_alpha, const vector<DPCell> *outside_alpha, 
					const Distance *distance,
                    bool allow_overlap, const probability &magic, _Functor &f);







/**
 *	A learning example:  a DNA sequence coupled with a background probability
 *	distribution, so that the likelihood of any subsequence [i..j) under the
 *	background model can be queried.
 *
 *	This is an abstract interface -- subclass it and implement the pure virtuals.
 */
class Example {
  public:

	virtual ~Example() {}

	/** The sequence data of this example */
	virtual const string* sequence() const = 0;

	/** The example's ``weight'':  1==POSITIVE, 0==NEGATIVE */
	virtual const probability& weight() const = 0;

	/** The example's ``name'', a unique ID */
	virtual const string& name() const = 0;

	/** Background sequence probability:  P(sequence[i..j) | BG model) */
	virtual probability bg(uint i, uint j) const = 0;

	/**
	 * Motif-model underlying probability:  P(underlying[i..j) | Binding site).
	 * Unless overridden, every interval gets probability 1.
	 */
	virtual probability prior(uint i, uint j) const {
		return 1;
	}
};



class BindingSite {

  public:
  	static const uint TMPL=0, TCX=1, NUM_STRANDS=2;

	bool negated;

  private:

  	cvector< pair< PWM<probability> , probability > > motifs;
	probability strand[NUM_STRANDS];
  
  public:

  	BindingSite() { negated = false; }

	inline PWM<probability>& motif(uint index) { return motifs[index].first; }
	inline const PWM<probability>& motif(uint index) const { return motifs[index].first; }

	inline probability& motif_preference(int index) { return motifs[index].second; }
	inline const probability& motif_preference(int index) const { return motifs[index].second; }

	inline probability& strand_preference(uint index) { return strand[index]; }
	inline const probability& strand_preference(uint index) const { return strand[index]; }

	void normalize();
	void pseudocount(const probability &pc);
	void fill(const probability &value=0);
	void add(const BindingSite &bs, const probability &weight=1);	// add parameters from identical structure
	bool subtract(const BindingSite &bs, const probability &weight=1);	// subtract; return true iff no zeros

	void insert(uint position, const PWM<probability> &pwm) 
		{ this->motifs.insert(motifs.begin() + position, pair<PWM<probability>,probability>(pwm, 1)); }
	void erase(uint position) { motifs.erase(motifs.begin() + position); }
	void replace(uint position, const PWM<probability> &pwm) { this->motif(position) = pwm; }

	uint size() const { return motifs.size(); }
	uint multiplicity() const { return motifs.size(); }
};

/**
 * cis-Regulatory Module (CRM) model:  a list of binding sites together with a
 * table of pairwise preferences (an order probability and a Distance per entry).
 */
class CRM {

  public:

	// ---- cis-Regulatory Module "global" parameters (static) ----

	// Single site:  if P(BG)/P(CRM) < M.R., evaluation of this motif instance is skipped
	static probability magic_ratio1;
	// Two or more sites:  if P(BG)/P(CRM) < M.R., evaluation of this motif instance is skipped
	static probability magic_ratio2;
	// Max. Distance Area:  how much of the distance area is considered
	// (the tail end after this much of the mass is ignored)
	static probability MDA;
	// Distance range (0 through MAXL-1)
	static uint MAXL;
	// Bin width (contiguous distances are binned together)
	static uint BINW;
	// Initial distance distribution:  GammaBeta=0 <=> uniform,
	// GammaBeta!=0 <=> Gamma dist. with alpha=2, beta=GammaBeta
	static double GammaBeta;
	// File Parameter Precision (avoids writing 50 decimal places)
	static uint FPP;


	// No early break for "sufficient probability" until this many values are explored
	static uint MINIMUM_IO_ITERATIONS;
	// Never execute more than this many possible values for `i' or `j'
	// NOTE (original author):  considered dangerous; should never be set.
	static uint MAXIMUM_IO_ITERATIONS;

  private:

	cvector<BindingSite> binding_sites;
	cvector<cvector<pair<probability, Distance> > > pairwise_preferences;

	// Resize `D' according to the static MAXL / BINW settings
	void reset(Distance &D) {
		D.resize(CRM::MAXL+1, CRM::BINW);
	}

	#if CRM_DEBUG
	bool check() const;	// check assumptions made by data structure
	#endif

	/** Build the CRM made of a subset of this one's sites, selected by a site
		bitmap (least sig. bit == site index 0).  Pairwise preferences are copied
		appropriately. */
	CRM subset(uint) const;

  public:

	// Binding site number `index'
	BindingSite& site(uint index) {
		return binding_sites[index];
	}
	const BindingSite& site(uint index) const {
		return binding_sites[index];
	}

	// Order preference stored at entry [a][b] of the pairwise table
	probability& order(uint a, uint b) {
		return pairwise_preferences[a][b].first;
	}
	const probability& order(uint a, uint b) const {
		return pairwise_preferences[a][b].first;
	}

	// Distance stored on the diagonal entry [a][a]
	Distance& distance(uint a) {
		return pairwise_preferences[a][a].second;
	}
	const Distance& distance(uint a) const {
		return pairwise_preferences[a][a].second;
	}

	// Distance for the pair (a,b); argument order is irrelevant because the
	// entry is always looked up at [min(a,b)][max(a,b)]
	Distance& distance(uint a, uint b) {
		const uint lo = std::min(a,b);
		const uint hi = std::max(a,b);
		return pairwise_preferences[lo][hi].second;
	}
	const Distance& distance(uint a, uint b) const {
		const uint lo = std::min(a,b);
		const uint hi = std::max(a,b);
		return pairwise_preferences[lo][hi].second;
	}

	// Number of binding sites
	uint size() const {
		return binding_sites.size();
	}

	void pseudocount(const probability &pc);
	// add parameters (sitemap is bitmap of which sites in `this' `addend' makes up)
	void add(const CRM &addend, const probability &weight=1, uint sitemap = (uint)-1);
	// subtract; return true iff no zeros
	bool subtract(const CRM &addend, const probability &weight=1, uint sitemap = (uint)-1);
	// add-on binding sites of peer (unknown pairwise parameters <- uniform)
	void cat(const CRM &peer);
	// reset everything
	void fill(const probability &value=0);
	// normalize everything
	void normalize();

	// copy structure (and fill to value)
	CRM structure(probability value=0) const;

	// add to end of list
	void add_site(const BindingSite &site);
	void add_site(const PWM<probability> &pwm);
	void add_motif(uint index, const PWM<probability> &pwm);
	void remove_site(uint index);

	void print(ostream &out) const;
	// one-line version of `print'
	void summarize(ostream &out) const;

	void dump_order(ostream &out) const;
	void dump_distance(ostream &out) const;

	/* number of negated sites */
	uint negated() const;

	/** True iff `subset' is the positive version of `crm':  it contains no negated
		site and has exactly as many sites as `crm' has non-negated ones. */
	static bool is_positive_path(const CRM &subset, const CRM &crm) {
		if (subset.negated() != 0) { return false; }
		return subset.size() == crm.size() - crm.negated();
	}

	/** Create a CRM for each possible subset of binding sites (see CRM::subset) */
	vector<CRM> powerset() const;

	/* File I/O (for saving CRMs), implemented in io.cpp */
	void write(ostream &out) const;
	bool read(istream &in);
	static string file_version();

};


/**
 *
 * The skeleton of (an analogy to) the inside-outside algorithm, as implemented by the inside_outside() template in this header.
 *
 * For each "previous" (upstream) CRM model parse, For each position of this
 * binding site (may be multiple motifs to try), Calculate `z' =  the
 * probability of the model parameters (order, distance i->j, motif choice,
 * strand choice) times the likelihood of the sequence data [0..L) (or just
 * [i..L) if no outside_alpha array is provided).
 
 
                         upstream
             background   motif     background      "previous" CRM model
                          _____                    _____________________
             ____________|_____|__________________|_____________________   DNA
    index:   0            i     k                  j                    L 
 

 * Call `f(i,j,m,r,v,z)' to do whatever you want with this information
 *
 *
 **/  

/**
 * Comparator for std::pair that orders by `first', largest first (descending).
 * `second' is ignored, so pairs with equal `first' compare as equivalent.
 *
 * Only requires `operator<' on T1.
 */
template <typename T1, typename T2>
struct SortPairByFirstDecending { 
	SortPairByFirstDecending() { }
	// const-qualified:  the comparison has no state to modify, and this lets the
	// functor be invoked through const objects and const references.
	bool operator()(const std::pair<T1,T2> &LHS, const std::pair<T1,T2> &RHS) const {
  		return (RHS.first < LHS.first);
	}
};

/**
 * Data for traceback/finding the MLE CRM location
 */
struct CRMLocation { 
  public:
	probability likelihood;			// likelihood of the traceback
	list<pair<uint, uint> > map;	// pairs of {site index, sequence position}
	CRMLocation() : likelihood(0) { }
	CRMLocation(const probability &al) : likelihood(al) { }
};




/**
 * Index triple identifying one candidate motif placement:
 *	i:  position of the start of the motif in the sequence
 *	m:  which motif of the binding site is used
 *	r:  motif strand (0 or 1)
 */
struct IMR { 
	uint i,m,r; 
	// Default to (0,0,0) so a default-constructed IMR never holds indeterminate values
	IMR() : i(0), m(0), r(0) { }
	IMR(uint ai, uint am, uint ar) : i(ai), m(am), r(ar) { }
}; 


/**
 * Dynamic Programming Cell for this algorithm:  
 *	A subsequence probability (q), and traceback pointer
 *	(index of downstream submodel, and likelihood)
 */
struct DPCell {
	probability q;	// subsequence probability (main entry)
	uint traceback;	// traceback pointer (e.g. 'j')
	probability z;	// probability of traceback (replace with higher values)

	// Value-initialise every member:  `traceback' becomes 0 instead of indeterminate;
	// q and z become zero if `probability' is a builtin type, and otherwise still get
	// whatever its own default constructor produces.
	DPCell() : q(), traceback(0), z() { }
	DPCell(const probability &aq, uint at, const probability &az) : q(aq), traceback(at), z(az) { }
};


/**
 * Inside / inside-outside skeleton for a single binding site.
 *
 * Ranks the candidate downstream boundaries `j' and the candidate motif placements
 * (i,m,r) by probability, then, for every (j, (i,m,r)) combination that survives the
 * pruning rules in the body, calls `f(i,j,m,r,v,z)'.
 *
 *	bs:             binding site whose motifs and motif/strand preferences are placed
 *	example:        supplies the sequence, bg(i,j) and prior(i,j); dereferenced unconditionally
 *	inside_alpha:   optional.  When non-NULL, (*inside_alpha)[j].q is the probability used for
 *	                the downstream part [j..L) and every j in 0..L is a candidate; when NULL,
 *	                only j == L is tried and a factor of 1 is used.
 *	outside_alpha:  optional.  When non-NULL, (*outside_alpha)[i].q is multiplied in; when NULL
 *	                a factor of 1 is used.
 *	distance:       queried with prob(D) and CDF(D), where D = j-i
 *	allow_overlap:  when false, placements whose motif end (i+w) lies beyond j are skipped
 *	magic:          fraction of the ranked probability mass to explore before breaking out early
 *	                (early breaks also depend on CRM::MINIMUM_IO_ITERATIONS)
 *	f:              functor, called as f(i,j,m,r,v,z)
 *
 * `bs' and `distance' are only checked (by assert) when CRM_DEBUG is enabled.
 */
template <typename _Functor>
void
inside_outside(
	const BindingSite *bs,
	const Example *example,
    const vector<DPCell> *inside_alpha, 
	const vector<DPCell> *outside_alpha,
	const Distance *distance,
	bool allow_overlap,			// calculate a `z' even for overlapping motifs
	const probability &magic,	// ratio of probability mass of locations to consider
	_Functor &f) {



	/*

	For each j in order of decreasing likelihood

		For each i in order of decreasing likelihood

			z =	   P(m)
				 * P(r)
				 * P(prior motif @ i)
				 * P(distance i->j)
				 * P( x[j..L-1] | inside )
				 * P( x[i..i+k-1] | motif, strand )
				 * P( x[i+k..j-1] | BG )
				 * P( x[0..i-1] | outside )
				 / Z

			where Z is a normalizing factor (probability of all distances, priors)

			(break iff examined enough likely values of `i')

		Next i
			
		(break iff examined enough likely values of `j')

	Next j

	*/




	

	// L:  length of the example's sequence
	const uint L = example->sequence()->length();

	#if CRM_DEBUG
	assert(bs);
	assert(distance);
	#endif

	// Candidate downstream boundaries `j', each paired with its ranking score
	vector<pair<probability, uint> > Jrank;	
	probability Jsum = 0;

	// Candidate motif placements (i,m,r), each paired with its ranking score
	vector<pair<probability, IMR> > IMRrank;
	probability IMRsum = 0;

	if (inside_alpha) {
		// j counts down L, L-1, ..., 0; decrementing past 0 wraps the unsigned
		// counter to a value > L, which terminates the loop.
		for (uint j=L; j<=L; j--) {
			// push back ( bg(0,j) * inside_alpha[j].q , j )
			probability Pj = example->bg(0,j) * ((*inside_alpha)[j]).q; 
			Jrank.push_back(pair<probability,uint>( Pj , j ) );
			Jsum += Jrank.back().first;
		}
	} else {
		// No inside array:  the only candidate is j == L.  (Jsum stays 0 in this case.)
		Jrank.push_back(pair<probability, uint>(1, L));
	}
	// Highest score first
	std::sort(Jrank.begin(), Jrank.end(), SortPairByFirstDecending<probability,uint>());

	for (uint r=0; r<BindingSite::NUM_STRANDS; r++) { 
	for (uint m=0; m<bs->multiplicity(); m++) { 
		const int w = bs->motif(m).width();	// shorthand
		const probability Prm = bs->motif_preference(m) * bs->strand_preference(r);
		// i counts down from L-w to 0 (same unsigned wrap-around termination as above).
		// NOTE(review):  if w > L, L-w presumably wraps to a value > L so the body never runs -- confirm.
		for (uint i=L-w; i<=L; i--) { // i:  start of motif
			// push back ( P(m)*P(r) * motif(m).likelihood over [i..i+w) * prior(i,i+w) , (i,m,r) )
			probability Pimr =   Prm
			                   * bs->motif(m).likelihood(*(example->sequence()), r==BindingSite::TCX, i, i+w)
							   * example->prior(i,i+w);
			IMRrank.push_back( pair<probability,IMR>(Pimr , IMR(i,m,r)) );
			IMRsum += Pimr;
		}
	}}
	// Highest score first
	std::sort(IMRrank.begin(), IMRrank.end(), SortPairByFirstDecending<probability, IMR>());

	// Among the top-ranked placements (those explored before the `magic' mass threshold or
	// the iteration limit is hit), find the largest start position (MAX_REQ_I) and the end
	// (i+w) of that placement (MIN_REQ_J).  The j loop below will not break out early for
	// "enough mass" until it has visited some j >= MIN_REQ_J.
	uint MIN_REQ_J = L;
	uint MAX_REQ_I = 0;
	if (magic < 1) {
		probability i_mass = 0;
		uint i_iterations = 0;
		for (vector<pair<probability,IMR> >::const_iterator i_itr=IMRrank.begin(); i_itr!=IMRrank.end(); i_itr++) {
				
			const uint i = i_itr->second.i;			// i:  position of start of motif
			const uint m = i_itr->second.m;			// m:  motif (which motif used by this site?)
			const uint w = bs->motif(m).width();	// shorthand, w:  motif width

			if (i > MAX_REQ_I) { 
				MAX_REQ_I = i; 
				MIN_REQ_J = i + w;
			}
			i_iterations++;
			i_mass += i_itr->first;

			if (i_iterations > CRM::MINIMUM_IO_ITERATIONS && i_mass > magic * IMRsum) { break; }
			if (i_iterations >= CRM::MAXIMUM_IO_ITERATIONS) { break; }
		}
	}

	probability j_mass = 0;	
	uint j_iterations = 0;
	bool MIN_REQ_J_SAT = false;	// becomes true once some j >= MIN_REQ_J has been visited
	for (vector<pair<probability,uint> >::const_iterator j_itr = Jrank.begin(); j_itr != Jrank.end(); j_itr++) {

		// The mass of this j is added before the break tests, so the j that crosses the
		// threshold is itself not processed.
		j_mass += j_itr->first;
		if (j_iterations > CRM::MINIMUM_IO_ITERATIONS 
			&& MIN_REQ_J_SAT
			&& j_mass > magic * Jsum) { break; }
		if (j_iterations >= CRM::MAXIMUM_IO_ITERATIONS) { break; }

		const uint j = j_itr->second;	// j:  position of first character in previous CRM (downstream)

		j_iterations++;
		MIN_REQ_J_SAT = MIN_REQ_J_SAT || ( j >= MIN_REQ_J );

		probability P_jL =  (inside_alpha) ? ((*inside_alpha)[j]).q : (probability)1; 
			// P(sequence&prior[j..L))

		#if CRM_DEBUG
		assert(!isnan(P_jL));
		#endif

		// Now we've established 'j', find prior probabilitie(s) of i
		
		// PASS #1:  Find prior probabilities given distance X prior (i.e. calc. normalizing constant, pZ)
		probability pZ = 0;
		if (example->prior(0,L) < 1) {
			// Sum prior(i,k) * P(distance) over the same placements PASS #2 will visit
			// (the pruning tests below mirror those of PASS #2).
			probability i_mass = 0;
			uint i_iterations = 0;
			for (	vector<pair<probability,IMR> >::const_iterator i_itr=IMRrank.begin();
					i_itr!=IMRrank.end(); 
				 	i_itr++		) {

				const uint i = i_itr->second.i;			// i:  position of start of motif
				const uint m = i_itr->second.m;			// m:  motif (which motif used by this site?)
				// const uint r = i_itr->second.r;			// r:  motif strand (0 or 1)
				const uint w = bs->motif(m).width();	// shorthand, w:  motif width
				const uint k = i+w;						// shorthand, k:  start of BG after motif
				const uint D = (j-i);					// distance i->j (only used after the i < j test below)

				i_mass += i_itr->first;
				if (i_iterations > CRM::MINIMUM_IO_ITERATIONS && i_mass > magic * IMRsum) { break; }
				if (i_iterations >= CRM::MAXIMUM_IO_ITERATIONS) { break; }
				
				// make sure `i' is legal given `j' (we know `i' <= L-w) 
				if (i >= j || (!allow_overlap && k > j)) { continue; }
			
				if (distance->prob(D) == 0) { continue; }	// 2006-07-18, don't count impossible distance as iteration

				i_iterations++;

				if (distance->CDF(D) >= CRM::MDA) { break; } 	// skip tail of distance distribution

				probability P =   example->prior(i,k) * distance->prob(D);
								// 2006-08-16, prev. included P(m, r), but this isn't part of pZ
								// * bs->motif_preference(m) * bs->strand_preference(r)

				pZ += P;

			}
		} else {
			pZ = distance->CDF(j);	// no prior distribution, normalizing const. for distance only
									//	is area from j on up.
									// NOTE(review):  whether CDF(j) is the area up to j or from j
									//	upward depends on Distance::CDF -- confirm.
		}

	
		#if CRM_DEBUG
		assert(!isnan(pZ));
		#endif
		
		
		#if CRM_DEBUG
		{
			// NOTE(review):  this flag is a function-local static, so the warning is emitted at
			// most once per template instantiation, not once per call as the message text says.
			static bool pZ_zero_warned = false;	// only warn once for this debugging message
			
			uint M = 0;	// find largest motif width
			for (vector<pair<probability,IMR> >::const_iterator i_itr=IMRrank.begin(); i_itr!=IMRrank.end(); i_itr++) { 
				if (bs->motif(i_itr->second.m).width() > M) { M = bs->motif(i_itr->second.m).width(); }
			}

			if (!pZ_zero_warned && j >= M && pZ == 0) {
				// should be room for a motif, yet pZ = 0
				cerr << endl
					 << "Warning (CRM debug mode):  "
					 << "CRM instantiation probability (pZ) = 0.  "
					 << "The probability may legitimately be zero because "
					 << "of zeros in the model.  "
					 << "This warning will only appear once per CRM call to Inside/Outside "
					 << "(j = " << j << ", M = " << M << ".)  "
					 << endl << endl;
				pZ_zero_warned = true;
			}
		}
		#endif

		// PASS #2:  Calculate contribution of i,j (and m,r) -> z, and call functor
		probability i_mass = 0;
		uint i_iterations = 0;
		for (vector<pair<probability,IMR> >::const_iterator i_itr=IMRrank.begin();
		     i_itr!=IMRrank.end(); 
			 i_itr++) {
			 
			 // pZ does not change inside this loop, so this skips the whole pass when pZ is zero
			 if (pZ == 0) { break; } //  2006-07-12, pZ may legitimately be zero.
			
			const uint i = i_itr->second.i;			// i:  position of start of motif
			const uint m = i_itr->second.m;			// m:  motif (which motif used by this site?)
			const uint r = i_itr->second.r;			// r:  motif strand (0 or 1)
			const uint w = bs->motif(m).width();	// shorthand, w:  motif width
			const uint k = i+w;						// shorthand, k:  start of BG after motif
			const uint D = (j-i);					// distance i->j (only used after the i < j test below)

			i_mass += i_itr->first;

			if (i_iterations > CRM::MINIMUM_IO_ITERATIONS && i_mass > magic * IMRsum) { break; }
			if (i_iterations >= CRM::MAXIMUM_IO_ITERATIONS) { break; }
			
			// make sure `i' is legal given `j' (we know `i' <= L-w) 
			if (i >= j || (!allow_overlap && k > j)) { continue; }

			if (distance->prob(D) == 0) { continue; }	// 2006-07-18, don't count impossible distance as iteration
			
			i_iterations++;
			
			if (distance->CDF(D) >= CRM::MDA) { break; } 	// skip tail of distance distribution


			// prob. of distance, sequence&prior[0..i),[k,j),[j..L) 
			probability P_d_0i_kL =   P_jL
									* ((outside_alpha) ? ((*outside_alpha)[i]).q : (probability)1)
									* distance->prob(D);
			
			if (k<=j) { P_d_0i_kL *= example->bg(k,j); }	// mult in BG[k..j) iff no overlap

			#if CRM_DEBUG
			assert(!isnan(P_d_0i_kL));
			assert(0 <= P_d_0i_kL && P_d_0i_kL <= 1);
			#endif

			// Recall that i_itr->first = P(choosing m,r, sequence and motif prior [i..i+w))

			// z = P(whole sequence | CRM Model)
			//   = P(choosing motif,strand * sequence 0..L * motif priors 0..L | i, j )
			//   =   P(distance, sequence&prior 0..i, k..L) 
			//     * P(motif,strand * sequence&prior i..k)
		
			probability z =   P_d_0i_kL			// P(bg[0..i) * bg[k..j) * alpha[j..L) * P(distance)
			                * i_itr->first		// P(m,r) * PWM[i..k) * prior(i)
							/ pZ;				// distance X prior normalizing constant

			#if CRM_DEBUG
			assert(!isnan(z));
			assert(0 <= z && z <= 1);
			#endif

			// v = likelihood of sequence(0..j):  z with the inside term for [j..L) divided back out
			//     (v == z when no inside_alpha is given; 0 when z is 0, which avoids dividing by q)
			probability v = (z==0) ? (probability)0 : z / ((inside_alpha) ? ((*inside_alpha)[j]).q : (probability)1);

			#if CRM_DEBUG
			assert(!isnan(v));
			assert(0 <= v && v <= 1);
			#endif

			f(i,j,m,r,v,z);	// i=upmotif, j=downCRM, m=disjunct, r=strand, v=z/inside[j].q, z=total-sequence-prob.

		} // next i,m,r (motif choice by this binding site)

	} // next j (outside loop)

	return;

} // inside_outside


#endif
