/*

  Functions for a homogeneous Markov model representing DNA sequence data
                  -----------                           ---

 Markov data is kept in a 

 	cvector<T*>   (one heap-allocated array of T per order)

 One RMO array for each order, 0, 1, ... N

 The 0th entry is an array of length 4, i.e. [ A-count, C-count, G-count, T-count ]
 The 1st entry is an array of length 16, i.e. [ A-A-count, A-C-count, ..., T-T-count ]
 The Nth entry is an array of length 4**N, i.e. [ A-...-A-A-count, A-...-A-C-count,..., T-...-T-T-count ]

 The 0th entry is indexed [ 0, 1, 2, 3 ]
 The 1st entry is indexed [ 0, 1, ... 15 ]

 In general, in base-4 arithmetic, the index of CTAGG is just CTAGG of A=0,C=1,G=2,T=3.
 This can be calculated by reading a digit, pushing its bits (A=00, C=01, G=10, T=11)
 onto a number, then shifting two to the left for the next nucleotide.  This means the
 maximum order is half the number of bits in the representation (unless I want to change
 to another data structure).

 To calculate the index of a sequence:

 	index = 0000000;
	for letter in sequence:
		index = index << 2;
		index |= letter's-bits (i.e. A=00, T=11)
	
 To update a number:

	mask = ~(-1 << 2*(order+1))  		// order+1 because order is PREVIOUS bases, +1 for the base in question
										// shift -1 because we want all the bits that are part of that number
										// complement because shifting -1 gives you 11...10000, we want 00...01111
			
	number = (number << 2) & mask;		// shift OUT the old least-recent-base, make room for the next

	number = number | new-base-digits	// OR the new base's two bits into the freed low positions

 To calculate the sequence from the index:

	for order from highest (order of MM) to and incl. zero:
		
		mask = 3 << (2*order);		// 2 bits/letter
		number = (index & mask) >> (2*order);	// mask with location for this order, move to first position
		letter = A for 0, C for 1, G for 2, T for 3.
		output letter.
	
	next order

  IMPORTANT:  This class handles DNA only (i.e. letters 'a','c','g', and 't').  'n' and IUPAC letters
  	are treated as a break in sequence: they do not contribute to the trained parameters, nor to the
	sequence likelihood, because they represent multiple combinations of previous bases.


-------------------------------------------------------------------------------------------


*/

#ifndef MARKOV_H
#define MARKOV_H

#define MARKOV_DEBUG 1
#if MARKOV_DEBUG
#include <assert.h>	// only assert in debug mode
#endif

#include <math.h>
#include <vector>
#include <iostream>
#include <iomanip>
using namespace std;

#include "cvector.h"
#include "genomic.h"	// all the DNA information used here
#include "WSS.h"


// Forward declaration of the Markov model class template defined below.
template <typename T> class MM;

// A token starting with this character marks a comment line for MM::read();
// MM::write() puts it at the start of each per-order header line.
const char MM_COMMENT = '#';
// Stream precision (setprecision) that MM::write() applies to every entry value.
const uint MM_PRECISION = 10;

/**
 *	Homogeneous Markov model over DNA, storing one table per order 0 .. order().
 *
 *	The table for order k holds requirement(k) == 4^(k+1) entries of type T.  An entry
 *	is addressed by the base-4 number formed by the k previous bases followed by the
 *	current base (two bits per base, most recent base in the lowest two bits).
 *
 *	NOTE(review):  the destructor frees the tables but no copy constructor or copy
 *	assignment is declared here; copying an MM looks unsafe unless cvector deep-copies
 *	its pointees -- confirm before copying instances.
 */
template <typename T>
class MM {

  private:
  	// data[k] points at the table for order k; allocated in resize(), freed in ~MM()
  	cvector<T*> data;

  public:

	// Constructors are named with the injected class name (`MM`, not `MM<T>`);
	// the template-id spelling is rejected from C++20 on.
	MM() { }							// empty model: no tables at all
	MM(uint order) { resize(order); }	// zero-filled tables for orders 0 .. order
	MM(const string &filename);			// declared only; no definition is visible in this header
	MM(istream&);						// declared only; no definition is visible in this header
	~MM();

	// read-only access to an entry (no bounds checking)
	const T& entry(uint order, uint index) const { return data[order][index]; }
	const T& operator()(uint order, uint index) const { return data[order][index]; }
	
	/** harmless subroutines: */

	// Number of entries in the table for `order`, i.e. 4^(order+1).
	// Computed in unsigned long so it does not overflow a 32-bit int for order >= 15.
	static unsigned long requirement(uint order) { return (1UL << (2*(order+1))); }

	// Bit mask selecting the lowest 2*i bits, i.e. the i most recent bases.
	// Saturates to all ones when i bases do not fit in an unsigned long; shifting by the
	// full width (or shifting a negative value, as `-1 << n` did) would be undefined.
	static unsigned long mask(uint i) {
		const uint capacity = 4 * sizeof(unsigned long);	// bases per unsigned long, assuming 8-bit bytes
		if (i >= capacity) { return ~0UL; }
		return ~(~0UL << (2*i));
	}

	static unsigned long sequence2index(const string &sequence, uint start=0, uint end=(uint)-1);
	static string index2sequence(unsigned long index, const uint bases);

	/** iterate through entries in an order (read-only, hence usable on a const model) */
	const T* begin(uint order) const { return data[order]; }
	const T* end(uint order) const { return data[order]+requirement(order); }

	void resize(const uint order);	// set to a new order
	void fill(const T& = (T)0);	// set all entries to given value (usually clear to zero)

	// Highest order stored.  For an empty model data.size() is 0 and the subtraction wraps.
	uint order() const { return data.size() - 1; }

	void read(istream&);
	void write(ostream&) const;
	
	void pseudocount(const T &pc=1);		// add pc to every entry of every order
	void normalize();						// scale each group of 4 entries sharing a context to sum to 1
	void add(const string &sequence, const T &weight=1, uint start=0, uint end=(uint)-1);

	T likelihood(const string &sequence, uint start=0, uint end=(uint)-1) const;

	vector<T> subsequence_likelihood(const string &sequence, uint start=0, uint end=(uint)-1) const;

}; // class MM

/**
 *	Make the model hold exactly the tables for orders 0 .. order.
 *
 *	Tables for orders that remain are left untouched, tables above the new order are
 *	freed, and tables for newly added orders are allocated and zero-filled.
 */
template <typename T>
void MM<T>::resize(const uint order) {

	const uint had = data.size();
	const uint want = order+1;	// order k needs k previous bases plus the base itself

	// free whatever is about to drop off the end (nothing to do when growing)
	for (uint k=want; k<had; ++k) { 
		if (data[k]) { delete[] data[k]; }
	}

	data.resize(want);

	// allocate and clear the tables that did not exist before
	for (uint k=had; k<want; ++k) { 
		const unsigned long entries = MM::requirement(k);
		data[k] = new T[ entries ];
		T *table = data[k];
		std::fill(table, table + entries, 0);
	}
}

/**
 *	Overwrite every entry of every order with `value`.
 */
template <typename T>
void MM<T>::fill(const T &value) {

	for (uint order=0; order<data.size(); ++order) { 
		T *first = data[order];
		T *last = first + MM::requirement(order);
		std::fill(first, last, value);
	}
}



/**
 *	Release the table of every order.
 */
template <typename T>
MM<T>::~MM() {
	for (uint order=0; order != data.size(); ++order) { 
		T *table = data[order];
		delete[] table;
	}
}


/**  
 *	Running likelihood of sequence[start, end) under this model.
 *
 *	Returns a vector V of length (end-start)+1 such that V[i] = probability of the letters
 *	from start up to but not including start+i (so V[0] is 1).
 *
 *	Per letter:
 *	  - a/c/g/t:    multiply by the entry conditioned on as many previous bases as are
 *	                available since the last break, up to the model's order.
 *	  - 'N'/'n':    factor 1 ("any base"), then a break in context.
 *	  - IUPAC code: factor = sum of the entries of the bases the code matches, then a
 *	                break in context.
 *	  - otherwise:  a warning is printed and the factor is 0.
 *
 *	`end` is clamped to the sequence length and `start` to `end`, so an inverted window
 *	yields just { 1 }.  When MARKOV_DEBUG is set, a partial product that is NaN or
 *	outside [0,1] is reported and the program exits.
 *
 **/
template <typename T> 
vector<T> MM<T>::subsequence_likelihood(const string &sequence, uint start, uint end) const {

	if (end > sequence.length()) { end = sequence.length(); }
	if (start > end) { start = end; }	// otherwise end-start wraps around to a huge length
	
	const uint L = end-start;	// number of letters to worry about
	
	vector<T> ans(L+1);

	const unsigned long mask = MM::mask(data.size());	// keeps at most order+1 bases in the index

	ans[0] = 1;
	uint lookahead = 0;	// order to use (start at 0, no letters so far)
	unsigned long index = 0;

	for (uint i=0; i<L; i++) {

		const uint p = start + i;	// sequence position
		const uint DNA_ID = dna_id(sequence[p]);

		if (DNA_ID < DNA_ABLEN) { 

			// a c g t
			
			index = ((index << 2) & mask) | DNA_ID;	// include most recent base
			ans[i+1] = ans[i] * this->data[lookahead][index];

			if (lookahead < (data.size()-1)) { lookahead++; }	// we can handle a larger order next time

		} else if (sequence[p] == 'N' || sequence[p] == 'n') { 
	
			// given letter is not {ACGT}, 'N' is a common enough case (?) to make calc. here fast
			ans[i+1] = ans[i];	// i.e. 100% chance of "any base"

			// An unknown base is a break in sequence, exactly as in add() and likelihood():
			// without the reset the next base would be conditioned on a context that
			// silently skips over the 'N'.
			lookahead = 0;
			index = 0;
		
		} else {

			#if MARKOV_DEBUG
			cerr << "*** Markov.h:  encountered '" << sequence[p] << "' @ " << p << endl;
			#endif

			T sum = 0;
			bool found_iupac = false;
			for (uint iupac_i=0; iupac_i<IUPAC_ALPHABET.length(); iupac_i++) { 
				if (sequence[p] == IUPAC_ALPHABET[iupac_i] || sequence[p] == iupac_alphabet[iupac_i]) { 
					// NOTE(review):  the letter's position in IUPAC_ALPHABET is used as a bit
					// set over bases (bit b set = matches base b) -- confirm against genomic.h.
					for (uint b=0; b<DNA_ABLEN; b++) { 
						if ( iupac_i & (1 << b) ) { 
							// match: same index update as for a plain base b
							const unsigned long index_b = ((index << 2) & mask) | b;
							sum += this->data[lookahead][index_b];
							#if MARKOV_DEBUG
							cerr << "*** \tmatches " << b << " ('" << (dna_alphabet[b]) << "'), adding " << (this->data[lookahead][index_b]) << " for a total of " << sum << endl;
							#endif
						}
					}
					found_iupac = true;
					break;
				}
			}

			if (!found_iupac) { 
				cerr << "WARNING:  In calcuating [sub]sequence likelihood | Markov model, "
					 << "encountered illegal character '" << sequence[p] << "'." << endl;
				#if MARKOV_DEBUG
				cerr << "[Sub]sequence likelihood is therefore zero." << endl
					 << "The '" << sequence[p] << "' occurred at position " << p
					 << " in sequence:\n\n\t" << sequence << "\n" << endl;
				#endif
			}

			ans[i+1] = ans[i] * sum;	// sum stays 0 for an illegal character

			// since the letter is not a plain base, reset lookahead
			lookahead = 0;
			index = 0;

		}

		#if MARKOV_DEBUG
		if (isnan(ans[i+1]) || !(0<=ans[i+1] && ans[i+1]<=1)) { 
			cerr << "markov.h:  MM::likelihood:  subsequence_likelihood()[" 	
				 << i << "+1] = " << ans[i+1] << endl;
			cerr << "i = " << i << endl;
			cerr << "ans[" << i << "] = " << ans[i] << endl;
			cerr << "sequence[start+i] = '" << sequence[p] << "'" << endl;
			exit(-1);
		}
		#endif

	}

	return ans; 
}

/**
 *	Load entries from a text stream of "<sequence> <value>" records.
 *
 *	A record whose first token starts with MM_COMMENT is a comment; the rest of its line
 *	is skipped.  The order of an entry is given by the length of its sequence token, and
 *	the model grows as needed to hold it.  Entries not mentioned in the stream keep
 *	their current value.
 *
 *	Reading stops at end of input, or at the first record whose value cannot be parsed
 *	(a warning is printed and nothing is stored for that record).  The sequence token is
 *	not validated here; see sequence2index() for how its letters are turned into an index.
 */
template <typename T>
void MM<T>::read(istream &in) {

	WSS wss;

	while (in.good()) { 

		string sequence;
		in >> sequence >> wss;

		// A successful extraction never produces an empty token, so an empty one means
		// the stream is exhausted or broken.  Going on would compute data[0-1].
		if (sequence.empty()) { break; }

		if (sequence[0] == MM_COMMENT) { 
			// swallow the remaining tokens of the comment line
			while (in.good() && !wss.newline) { 
				string dummy;
				in >> dummy >> wss;
			}
			// Back to the stream check:  a comment on the very last line must not fall
			// through and be stored as if it were a data record.
			continue;
		}

		T t=0;
		in >> t;
		// Sample the state before skipping whitespace:  the skip may itself flag the
		// stream at end of file even though the value was read correctly.
		const bool have_value = !in.fail();
		in >> wss;

		if (!have_value) { 
			cerr << "markov.h:  MM::read(...):  WARNING:  could not read a value for '"
				 << sequence << "'; reading stops here." << endl;
			break;
		}

		const uint bases = sequence.length();
		if (bases > data.size()) { 
			this->resize(bases - 1); 	// order is actually bases-1
		}
		data[bases-1][sequence2index(sequence)] = t;

	}
}

/**
 *	Write the model as text:  for each order a comment header line, then one
 *	"<sequence><TAB><value>" line per entry, in index order.
 */
template <typename T> 
void MM<T>::write(ostream &out) const {

	const uint orders = data.size();

	for (uint order=0; order<orders; ++order) { 

		// header is a comment line, so read() will skip it
		out << MM_COMMENT << " --- Order " << order << ": ---" << endl;

		const uint bases = order+1;		// letters in each sequence of this order
		const unsigned long entries = MM::requirement(order);

		for (uint index=0; index < entries; ++index) { 
			out << index2sequence(index, bases);
			out << "\t";
			out << setprecision(MM_PRECISION) << data[order][index];
			out << endl;
		}
	}
}


/**
 *	Add `pc` to every entry of every order.
 */
template <typename T> 
void MM<T>::pseudocount(const T &pc) { 
	for (uint order=0; order<data.size(); ++order) { 
		T *cell = data[order];
		T * const stop = cell + requirement(order);
		while (cell != stop) { 
			*cell += pc;
			++cell;
		}
	}
}

/**
 *	Count sequence[start, end) into the model.
 *
 *	For each a/c/g/t position, `weight` is added to the entry ending at that position in
 *	every order for which enough bases have been seen since the last break (up to the
 *	model's order).  Any other character is a break:  it is not counted and the context
 *	starts over after it.  `end` is clamped to the sequence length.
 */
template <typename T>
void MM<T>::add(const string &sequence, const T &weight, uint start, uint end) {

	if (end > sequence.length()) { end = sequence.length(); }

	uint depth = 0;				// highest order usable at the current position
	unsigned long window = 0;	// bases seen so far, 2 bits each, newest in the low bits

	for (uint pos=start; pos<end; ++pos) {

		const uint base = dna_id(sequence[pos]);

		if (base >= DNA_ABLEN) { 

			// Non-DNA can't be counted (not every combination of the last `order'
			// characters is tracked), so it just breaks the sequence.
			depth = 0;
			window = 0;

			#if MARKOV_DEBUG
			if (iupac_id(sequence[pos]) == IUPAC_ABLEN) { 
				cerr << "markov.h:  MM::add(...):  Unknown character in sequence '"
					 << sequence[pos] << "' (character " << ((int)sequence[pos]) << " at "
					 << "position " << pos << ")." << endl;
			}
			#endif

			continue;
		}

		window = (window << 2) | base;

		// the newest base ends one (k+1)-letter word for every order k up to depth
		for (uint k=0; k<=depth; ++k) { 
			data[k][window & MM::mask(k+1)] += weight;
		}

		if (depth < (data.size()-1)) { ++depth; }
	}
}



/**
 *	Turn counts into conditional probabilities.
 *
 *	Within each order, every run of DNA_ABLEN consecutive entries (same previous bases,
 *	differing only in the final base) is divided by its sum.  Runs whose sum is not
 *	positive are left as they are.
 */
template <typename T>
void MM<T>::normalize() { 

	for (uint order=0; order<data.size(); ++order) { 

		const unsigned long entries = MM::requirement(order);

		for (uint first=0; first<entries; first += DNA_ABLEN) {

			T *group = data[order] + first;

			T total = 0;
			for (uint b=0; b<DNA_ABLEN; ++b) { total += group[b]; }

			if (!(total > 0)) { continue; }		// nothing observed for this context

			for (uint b=0; b<DNA_ABLEN; ++b) { group[b] /= total; }
		}
	}
}





	
	
/**
 *	Likelihood of sequence[start, end) under this model.
 *
 *	The result is the product, over the a/c/g/t positions, of the entry conditioned on as
 *	many previous bases as are available since the last break (up to the model's order).
 *	Any other character contributes nothing and breaks the context.  Returns early once
 *	the product reaches zero.  `end` is clamped to the sequence length.
 *
 *	The context is kept as a number with two bits per base:  shift left by two to make
 *	room, mask to the model's width to drop the oldest base, then OR in the new base.
 */
template <typename T>	
T MM<T>::likelihood(const string &sequence, uint start, uint end) const {

	if (end > sequence.length()) { end = sequence.length(); }

	#if MARKOV_DEBUG
	if (!over_dna(sequence)) { cerr << "MM::likelihood:  WARNING:  sequence uses non-DNA alphabet." << endl; }
	#endif

	const unsigned long keep = MM::mask(data.size());	// width of the highest-order index
	uint depth = 0;				// order to use for the next base
	unsigned long window = 0;	// most recent bases, newest in the low bits
	T product = 1;
	
	for (uint pos=start; pos<end; ++pos) { 

		const uint base = dna_id(sequence[pos]);

		if (base >= DNA_ABLEN) { 

			// treat non-DNA as a break
			
			#if MARKOV_DEBUG
			cerr << "markov.h:  MM::likelihood(...):  WARNING:  Illegal character '" 
			     << sequence[pos] << "'." << endl;
			#endif

			depth = 0;
			window = 0;
			continue;
		}

		window = ((window << 2) & keep) | base;
		product *= data[depth][window];

		if (product==0) { return product; }		// nothing can bring it back up
		if (depth < data.size()-1 ) { ++depth; }	// one more base of context is now available
		
	}
	return product;
}


		
/**
 *	Table index of the word sequence[start, end):  its letters read as a base-4 number
 *	(two bits per letter via dna_id(), last letter in the lowest two bits).
 *
 *	`end` is clamped to the sequence length; an empty or inverted window gives 0.  If the
 *	window holds more letters than fit in an unsigned long, only the most recent ones
 *	remain in the result.
 *
 *	Letters are not validated:  whatever dna_id() returns is OR-ed in and the result is
 *	truncated to the width of the window, so the index is always in range for a table of
 *	that word length but is meaningless if the window contains non-DNA characters.
 */
template <typename T>
unsigned long MM<T>::sequence2index(const string &sequence, uint start, uint end) { 
	
	if (end > sequence.length()) { end = sequence.length(); }

	// The mask is sized by the window actually being indexed, not by the whole sequence,
	// and is built in unsigned long arithmetic:  a short window inside a long sequence
	// must not produce an over-wide (undefined) shift.
	const uint bases = (end > start) ? (end - start) : 0;
	const uint capacity = 4 * sizeof(unsigned long);	// bases per unsigned long, assuming 8-bit bytes
	const unsigned long mask = (bases >= capacity) ? ~0UL : ~(~0UL << (2*bases));

	unsigned long index = 0;
	for (uint i=start; i<end; i++) { 
		
		index = (index << 2);	// make room for the next letter
		index |= dna_id(sequence[i]);
		index &= mask;
	
	}

	return index;
}

/**
 *	Inverse of sequence2index():  spell out the lowest `bases` base-4 digits of `index`
 *	as letters of DNA_ALPHABET, most significant digit first.
 */
template <typename T>
string MM<T>::index2sequence(unsigned long index, uint bases) { 
	string word = "";
	for (uint remaining=bases; remaining>0; --remaining) { 
		// the lowest digit is the last letter, so each new letter goes in front
		const string letter = DNA_ALPHABET.substr(index&3, 1);
		word = letter + word;
		index >>= 2;	// drop the digit just consumed
	}
	return word;
}


#endif // MARKOV_H
