/*
	Create N-th Order Markov Model File as:

	# comment
	AGCT	0.002341289342923

	(i.e. Sequence followed by frequency)

*/


#include <stdlib.h>
#include <math.h>

#include <fstream>
#include <sstream>
#include <iostream>
#include <stdlib.h>
#include <list>
#include <string>
using namespace std;

#include "fasta.h"
#include "markov.h"
#include "Option.h"

/**
 * Tally k-mer occurrences from the N characters starting at S into mm.
 *
 * mm[k] is the count table for order k; for every position the windows of
 * length 1 .. mm.size() that END at that position (and do not start before
 * S) are looked up with markov_offset() and their table entry is incremented.
 *
 * Scanning stops at the first window for which markov_offset() reports a
 * negative offset (a nonstandard base, i.e. not one of {AGCT}).
 *
 * Returns the position, relative to S (S itself is 0), one past the last
 * position processed: N when the whole range was consumed, otherwise one
 * past the offending base so the caller can resume from there.
 */
int populate_counts(const char *S, int N, vector<vector<long double> > &mm) {

	// One table per order: mm.size()==3 means orders 0, 1 and 2.
	const int max_order = static_cast<int>(mm.size()) - 1;

	for (int pos = 0; pos < N; ++pos) {

		// A window of (k+1) bases ending at 'pos' must not reach back before
		// S, so the deepest usable order here is min(pos, max_order).
		const int deepest = (pos < max_order) ? pos : max_order;

		for (int k = 0; k <= deepest; ++k) {
			const int slot = markov_offset(S + pos - k, k + 1);
			if (slot < 0) {
				// Nonstandard base at 'pos': hand back the position after it.
				return pos + 1;
			}
			mm[k][slot] += 1.0;
		}
	}

	// Entire range consumed.
	return N;
}

/**
 * Read FASTA records from 'in' until the stream stops being good(), appending
 * one (first, second) pair per fasta_read() call to 'records'.
 *
 * Because the stream state is only tested before each read, the last pair
 * appended may be whatever fasta_read() leaves behind at end of input
 * (presumably empty strings -- an empty sequence contributes no counts).
 */
static void read_fasta_records(istream &in, list<pair<string,string> > &records) {
	while (in.good()) {
		records.push_back(pair<string,string>("",""));
		fasta_read(in, records.back().first, records.back().second);
	}
}

/**
 * Build a Markov model of the requested order from FASTA data and write it
 * to standard output.
 *
 * Input comes from the file named by the first positional argument, or from
 * standard input when no positional argument is given.  Every count table is
 * seeded with the pseudocount, filled from all sequences, optionally
 * normalized, then written with markov_write().
 *
 * Returns 0 on success; exits with -1 if the input file cannot be opened or
 * the requested order is negative.
 */
int main(int argc, char **argv) {

	// Initialised to the same defaults handed to the option parser below so
	// they never hold indeterminate values.
	int order = 0;
	double pseudocount = 0.0;
	bool normalize = true;

	ostringstream synopsis;
	synopsis << "Create Markov model of DNA distribution from sequence data.  "
	         << "Output format:  Character sequences followed by frequency "
			 << "of occurance.  Lines preceeded with '#' are comments.";
	OptionParser parser(synopsis.str());
	parser.add("order", 'o', &order, 0, "Order of the Markov model");
	parser.add("pcount", 'p', &pseudocount, 0.0, "Pseudocount (add this many to each instance)");
	// NOTE(review): 'normalize' defaults to true; presumably passing the
	// "nonorm" flag clears it -- confirm against OptionParser.
	parser.add("nonorm", 'k', &normalize, true, "Do not normalize counts");

	vector<string> args = parser.parse(argc, argv, " < FASTA file");

	// A negative order would make the table vector below either empty or
	// absurdly large (order+1 converted to an unsigned size); reject it.
	if (order < 0) {
		cerr << "Order of the Markov model must be >= 0 (got " << order << ")" << endl;
		exit(-1);
	}

	list<pair<string,string> > data;
	if (args.size()) {
		ifstream fin(args[0].c_str());
		if (!fin) { cerr << "Can't open " << args[0] << endl;  exit(-1); }
		read_fasta_records(fin, data);
		fin.close();
	} else {
		read_fasta_records(cin, data);
	}

	// create empty Markov model data: the table for order k has
	// ABLEN^(k+1) entries, each starting at the pseudocount.  The size is
	// built up by exact integer multiplication rather than truncating the
	// floating-point result of pow().
	vector<vector<long double> > mm(order+1);
	size_t table_size = 1;
	for (size_t k = 0; k < mm.size(); ++k) {
		table_size *= static_cast<size_t>(ABLEN);
		mm[k].assign(table_size, static_cast<long double>(pseudocount));
	}

	for (list<pair<string,string> >::const_iterator rec = data.begin(); rec != data.end(); ++rec) {

		// populate_counts() stops after each nonstandard base and returns a
		// position RELATIVE to the pointer it was given, so both the cursor
		// and the remaining length must be advanced by that amount.
		const char *cursor = rec->second.c_str();
		int remaining = static_cast<int>(rec->second.length());
		while (remaining > 0) {
			const int consumed = populate_counts(cursor, remaining, mm);
			cursor += consumed;
			remaining -= consumed;
		}

	}

	// normalize mm
	if (normalize) {
		for (int mo = 0; mo <= order; mo++) { markov_normalize(mm, mo); }
	}

	// write mm
	markov_write(cout, mm);

	return 0;
}
