/**
 * gs:  generate (Gaussian mixture) states (from expression data)
 *
 *	input:  expression file or GRN file (see FILE_FORMATS.README)
 *
 *	output:  GaussianMixture parameters for all genes (see "states" file format)
 *
 */

#include "gaussian.h"
#include "GMGenerator.h"
#include "mixture_constraints.h"
#include "OptionParser.h"
#include "datafile.h"

#include <stdlib.h>

#include <fstream>
#include <iostream>
#include <list>
#include <string>
#include <utility>
#include <vector>
using namespace std;
	
/**
 * Fit a Gaussian mixture to the samples in [begin, end).
 *
 * A default-constructed GMGenerator does the fitting; the three numeric
 * arguments are forwarded, in this order, to GRN_GaussianMixtureConstraints.
 *
 * @tparam DoubleIter  iterator over doubles (whatever GMGenerator::generate accepts)
 * @param begin        first sample
 * @param end          one past the last sample
 * @param K            first constraint argument (the "minK" option of main)
 * @param min_stdev    second constraint argument (the "minSigma" option of main)
 * @param min_weight   third constraint argument (the "minW" option of main)
 * @return the mixture produced by GMGenerator::generate
 */
template <typename DoubleIter> // was "_Ditty": underscore + capital names are reserved for the implementation
GaussianMixture generate(DoubleIter begin, DoubleIter end,
                         double K, double min_stdev, double min_weight) { 

	GMGenerator gmg;	// use all default parameters
	return gmg.generate(begin, end, 
	                    GRN_GaussianMixtureConstraints(K, min_stdev, min_weight));
}


int main(int argc, char **argv) { 

	int seed;

	double K, min_weight, min_stdev;

	string output_file, assay_file, xpr_file;

	bool verbose;

	OptionParser parser("Generate states.  Pre-process a set of expression data into a set of Gaussian Mixture Models, using pre-defined constraints on mixtures (e.g. minimum variance and weight)"); 

	parser.add(Option("seed", 's', &seed, 8174, "random number seed"));
	parser.add(Option("minK", 'K', &K , 2.0, 
		"number of standard distrbutions from which each Gaussian must be separated"));
	parser.add(Option("minSigma", 'g', &min_stdev, 0.1, 
		"minimum standard distribution of any Gaussian in the predicted mixture"));
	parser.add(Option("minW", 'w', &min_weight, 0.1, 
		"minimum weight of any Gaussian in the predicted mixture"));

	parser.add(Option("assay", 'a', &assay_file, "", "Assay data input file:  first line has number-of-arrays number-of-conditions number-of-genes, second line is conditions-names and genes-names (column header), rest of lines are array-name followed by variable values (conditions values or genes expression)"));

	parser.add(Option("expression", 'x', &xpr_file, "", "Expression input file:  first line is column headers (gene names), rest of records are array-name followed by expression for each gene"));

	parser.add(Option("verbose", 'v', &verbose, false, "Verbose output"));
	
	parser.add(Option("output", 'o', &output_file, "", "output file (write to stdout otherwise)"));
	
	vector<string> args = parser.parse(argc, argv);

	srand(seed);

	if (output_file != "") {
		// test output file.
		ofstream test_outfile(output_file.c_str());
		if (!test_outfile) { 
			cerr << "Could not open " << output_file << " for write." << endl;
			exit(-1);
		} else {
			test_outfile.close();
		}
	}
	

	list<pair<string, GaussianMixture> > states;

	if (xpr_file != "" && assay_file == "") {	// changed to give preference to assay file
		// read from expression file
		
		//	void read_expression(string filename, vector<string> &header, 
		//		vector<pair<string, vector<double> > > &expression, ostream &err);

		vector<string> header;
		vector<pair<string, vector<double> > > expression;

		if (verbose) { cerr << "generate states from " << xpr_file << endl; }
		if (!read_expression(xpr_file, header, expression, cerr)) { exit(-1); }

		int N = expression.size();	// number of arrays
		int G = header.size();	// number of genes

		for (int g=0; g<G; g++) { 

			string name = header[g];
			vector<double> data(N);
			for (int a=0; a<N; a++) { data[a] = expression[a].second[g]; }

			if (verbose) {
				int percent = int(100.0 * ((double)g / G));
				cerr << name << " " << percent << "%\t\r";
			}
			
			GaussianMixture gm = generate(data.begin(), data.end(), K, min_stdev, min_weight);
			states.push_back(pair<string,GaussianMixture>(name, gm));
		}
		if (verbose) { cerr << "100%                            " << endl; }
		

	} else if (assay_file != "") {
		// read from assay file

		int N,C,G;
		vector<string> arrays;
		vector<string> conditions;
		vector<string> genes;
		vector<vector<string> > values;
		vector<vector<double> > expression;
		if (verbose) { cerr << "generate states from " << assay_file << endl; }
		
		if (!read_assay_data(assay_file, N, C, G, arrays, conditions, genes, values, expression, cerr)) { exit(-1); }

		for (int g=0; g<G; g++) { 

			string name = genes[g];
			vector<double> data(N);
			for (int a=0; a<N; a++) { data[a] = expression[a][g]; }

			if (verbose) {
				int percent = int(100.0 * ((double)g / G));
				cerr << name << " " << percent << "%    \r";
			}
			GaussianMixture gm = generate(data.begin(), data.end(), K, min_stdev, min_weight);
			states.push_back(pair<string,GaussianMixture>(name, gm));
		}
		if (verbose) { cerr << "100%         " << endl; }

	} else {
		// 
		cerr << "Malfunction.  Need input file (-x or -a)" << endl;
		parser.usage(cerr);
		exit(-1);

	}

	// now print output

	if (output_file != "") { 
		// use filename
		ofstream outfile(output_file.c_str());
		if (!outfile) { 
			cerr << "Could not open " << output_file << " for write." << endl;
		} else {
			for (list<pair<string, GaussianMixture> >::const_iterator i=states.begin();
			                                                          i!=states.end(); i++) {
				int M = i->second.size();
				outfile << i->first << "\t"
				        << M << "\t";
				for (int m=0; m<M; m++) { 
					outfile << i->second[m].mu << "\t"
					        << i->second[m].sigma << "\t"
					        << i->second[m].w << "\t";
				}
				outfile << endl;
			}
			outfile.close();
		}
	} else {
		// stdout
		for (list<pair<string, GaussianMixture> >::const_iterator i=states.begin();
			                                                      i!=states.end(); i++) {
			int M = i->second.size();
			cout << i->first << "\t"
				 << M << "\t";
			for (int m=0; m<M; m++) { 
				cout << i->second[m].mu << "\t"
					 << i->second[m].sigma << "\t"
					 << i->second[m].w << "\t";
			}
			cout << endl;
		}
	}
		
	return 0;
} // main

