#include <iostream>
#include <iomanip>
#include <fstream>
#include <iterator>
#include <vector>
#include <list>
#include <map>
#include <time.h>
using namespace std;

#include "CRM.h"			// the CRM learner
#include "genomic.h"		// constants and standards
#include "grid.h"			// pretty-printing 2-D information	
#include "Timer.h"			// a simple timer
#include "markov.h"			// simple sequence Markov model functionality
#include "Option.h"			// My option parser and help file generator
#include "fasta.h"			// FASTA file format functions, fool.
#include "probability.h"	// defines probability type
#include "IOExample.h"		// defines the kind of readable example that I use 
							//	(since sequence data take up space, I don't want to copy them)
#include "metric.h"			// defines scoring metric
#include "custom.h"			// defines any additional constraints on the model

/** Program name and version string. */
const string PROGRAM_VERSION = "Structured CRM 2.2";
	//
	//	Version history:
	//	2.0:	Generative mode complete.
	//	2.1:	Krogh's algorithm used 
	//	2.2:	Powerset:  Now all paths are considered (2006-07-24)
	//

/** Indices of the example sets (used as examples[set] in this file),
	followed by the number of sets.
 */
const uint TRAIN=0, TUNE=1, TEST=2, NUM_SETS=3;

/** Verbosity level:
	V_MIN:  minimal.  program options and results.
	V_IM:  intermediate.  tell me what you're doing, but keep it brief.
	V_MAX:  maximal.  tell me everything (except dumping distance distributions)
 */
const uint V_MIN=0, V_IM=1, V_MAX=2;

/** GlobalParameters is the struct holding every program option.
	It derives from OptionParser; the constructor either hard-codes an option
	or registers it with add(longname, flag, pointer, default, help).
	All functions read the options through the global const pointer
	`parameter'.
 */
struct GlobalParameters;	
struct GlobalParameters : public OptionParser {

  public:

	// ---- input / output files (empty string = not given) ----
	string background_file;	// arbitrary background distribution for each input sequence
	string mm_file;			// Markov chain background model file
	string cm_file;			// candidate motifs (FASTA)
	string distance_file;	// output distance distribution(s) file
	string prior_file;		// prior probabilities over binding site locations per sequence
	string weights_file;	// sequence weights
	
	uint emi1, emi2;	// Krogh update iterations (two cases 1==1 binding site, 2==2+ binding sites)
	
	uint C,D,G;			// maximum number of: binding sites (C), motifs per binding site (D), negated binding sites (G)

	uint K,Kseeds;		// K is hard-coded below (its disabled option calls it the beam width);
						//	Kseeds = number of single-motif models used to initialize the beam
	uint theory_iterations;	// hard-coded below (the option that used to set it is commented out)
	
	uint crmlim;		// stopping criterion:  maximum number of explored models
	
	uint minw, maxw;	// minimum / maximum motif width
	bool projection;	// hard-coded to true below; read by random_motif() to choose the seeding method
	
	uint folds, fold;	// number of cross-validation folds, and the fold currently held aside for testing
	double tunerat;		// ratio of training examples to hold aside for evaluation only

	int seed;			// PRNG seed (if negative, use system time)
	uint verbose;	// verbosity level; see V_MIN / V_IM / V_MAX (default is V_IM, or V_MAX when CRM_DEBUG is set)

	bool generative;			// Use a strictly generative algorithm when training (but not when choosing models)
	bool twoworlds;				// Make the "two worlds" assumption:  Either a sequence contains all binding site or none (background only).
								//	NOTE:  in the case of negated binding sites, this is a "three worlds" assumption.
	probability eta;			// learning rate

	bool reweight_negatives;	// in learn(...), reweight all negative examples so the total presence equals the number of positives
	probability pseudocount;	// standard per-sequence pseudocount to add to all parameters
	bool uniform_distance;		// should the distance distribution always be uniform?
	bool uniform_strand;		// .. strand distribution
	bool uniform_order;			// .. order distribution
	bool uniform_motif;			// .. motif preference
	
	double ksmooth_stdevs;		// Smoothing Kernel HALF (!) size (in std. deviations, will be symmetrical, i.e. 2*H+1)
	double ksmooth_max_stdev;	// maximum std. dev. for smoothing (on unit scale)

	string comment;				// free-form comment option (ignored)

	string crm_load_filename;	// read CRM from this file (instead of learning)
	string crm_save_filename;	// write learned CRM to this file
	string crm_base_filename;	// read CRM from this file, use as 'base' for all learned models.
	bool base_fixed;			// is the CRM base model fixed (i.e. don't adjust parameters)?

	/** Registers every command-line option (with its default and help text)
		and initializes the options that are not user-settable.
		`synopsis' is passed straight through to the OptionParser base.
	 */
	GlobalParameters(string synopsis) 
	: OptionParser(synopsis)
	{

		// Some CRM parameters are defined (initialized) here instead of being
		//	set by the user via ``add(longname, flag, pointer, default, help).''
		
		// REMINDER:  If you remove an option (comment out the call to ``add(...)''), 
		//	don't forget to initialize the variable here (otherwise it is left uninitialized).

		CRM::MINIMUM_IO_ITERATIONS = 1;
		CRM::MAXIMUM_IO_ITERATIONS = (uint)-1; 	// this is per position (max i and, separately, max j)
		theory_iterations = 1; 					// add("thits", 'i', &theory_iterations, 2, "Number of times to re-iterate main loop with best current working theory");
		
		CRM::MDA = 1.0;							//	add("MDA", 'A', &CRM::MDA, 1.0, "Max. Distance Area:  Evaluate this much of the distance distribution");
		ksmooth_stdevs = 2; 
		projection = true; 						// add("norp", NULL_FLAG, &projection, true, "Seed new motifs randomly (no projection)");
		reweight_negatives = true;

		add("comment", 'X', &comment, "", "Comment (ignored).");

		// ---- input files ----
		add("bg", 'b', &background_file, "", "Arbitrary background distribution per sequence.  FASTA-like format:  \
		                                     	>unique-sequence-ID [rest of line is ignored].  \
												Following i numbers are the LOG-SCALE likelihood of the subsequence \
												up to, but not including position i (so there are L+1 numbers \
												for a sequence of length L).");
		add("mm", 'm', &mm_file, "", "Background distribution Markov chain model file \
										(subsequence followed by probability, e.g. \"CGTA    0.09141979\").");
		add("candidate", 'c', &cm_file, "", "Candidate motif file (consensus sequences in FASTA format).  \
											 These single-motifs are considered, and structure may be \
											 added to them, but they are not guaranteed to be part of \
											 the final model.");
		add("weights", 'W', &weights_file, "", "Sequence weights:  Probability a given sequence is positive \
													(FASTA-like format:  \
		                                        	>unique-sequence-ID [rest of line is ignored].  \
													Following line contains a single number which is \
													the weight of this sequence).");
		add("prior", 'V', &prior_file, "", "Prior distribution of binding site locations over sequence \
		                                    	(FASTA-like format:  >unique-sequence-ID [rest of \
												line is ignored].  Following lines contain real numbers, \
												one per character in the cooresponding sequence \
												(matched by the unique ID string).");

		// ---- model load / save ----
		add("base", NULL_FLAG, &crm_base_filename, "", "Base all hypothesis CRM models on this submodel");
		add("badjust", 'j', &base_fixed, true, "Allow parameters of base model to be adjusted during learning");
		add("load", NULL_FLAG, &crm_load_filename, "", "Read CRM from this file (instead of learning the model)");
		add("save", NULL_FLAG, &crm_save_filename, "", "Write learned CRM to this file");
		add("fileprec", NULL_FLAG, &CRM::FPP, 10, "Parameter (probability) precision in stored files (see --save)");
		add("dfile", 'f', &distance_file, "", "Dump distance distribution(s) data to this file.");
	
		// ---- model structure limits ----
		add("C", 'C', &C, 3, "Maximum number of binding sites (i.e. conjuncts).");
		add("D", 'D', &D, 3, "Maximum number of motifs/binding site (i.e. motif disjuncts for each binding site).");
		add("G", 'G', &G, 1, "Maximum number of negated binding sites \
		                      	(this is also limited by the maximum number of binding sites).");

		// ---- force uniform distributions ----
		add("d", 'd', &uniform_distance, false, "Always use a uniform distance distribution.");
		add("o", 'o', &uniform_order, false, "Always use a uniform order distribution.");
		add("s", 's', &uniform_strand, false, "Always use a uniform strand distribution.");
		add("unior", 'u', &uniform_motif, false, "Force a uniform probability distribution over disjunctions.  \
		                                     	Binding sites are represented by a disjunction of \
												motifs (OR logic).  By default, the algorithm learns the \
												probability of each, but this option forces that distribution to be \
												uniform.");
		
		// ---- motif widths ----
		add("minw", 'w', &minw,  8, "Minimum motif Width.  The learner will alter model structure \
		                             	by adding new motifs at least this wide.");
		add("maxw", 'x', &maxw, 15, "MaXimum motif width.  The learner will add new motifs at most this wide.");


		// hard-coded (their options are disabled):
		eta = 1.0; 	// add("eta", 'n', &eta, 1.0, "Learning rate for discriminative training");
		
		K=100;	//add("K", 'K', &K, 10, "Beam width");
		
		// ---- training algorithm ----
		add("gen", 'g', &generative, false, "Use generative EM to learn parameters from positive examples (does not apply to choosing models)");
		add("2path", '2', &twoworlds, false, "Restrict the set of possible HMM paths to two:  All binding sites or no binding sites (background only). \
											In the case of negated CRMs, there will be three paths:  All binding sites, all positive binding sites \
											and no binding sites.");
		
		add("emi1", 'e', &emi1, 10, "For CRM models with one binding site \
		                             	(running time O(n) in sequence length; relatively fast), \
										the number of E-M (Krogh) update iterations.");
		add("emi2+", 'E', &emi2, 10, "For CRM models with more than one binding site \
										(running time O(n^2) in sequence length; slower), \
										the number of the number of E-M (Krogh) update iterations.");
		
		add("ratio1", 'r', &CRM::magic_ratio1, 0.10, "For CRM models with one binding site \
		                                              	(running time O(n) in sequence length; relatively fast), \
														the amount of probability mass to examine over possible \
														motif locations.");
		add("ratio2+", 'R', &CRM::magic_ratio2, 0.10, "For CRM models with more than one binding site \
													  	(running time O(n^2) in sequence length; slower), \
														the amount of probability mass to examine over \
														possible motif locations.");
		
		// ---- smoothing ----
		// NOTE(review): the option name is spelled "pseduocount"; it is left as is
		//	because renaming it would change the command-line interface.
		//add("pseduocount", 'p', &pseudocount, 0.001, "Per-sequence pseudocount to smooth CRMs.");
		add("pseduocount", 'p', &pseudocount, 0, "Per-sequence pseudocount to smooth CRMs.");
		add("SMSD", 'k', &ksmooth_max_stdev, 0.1, "Maximum kernel standard deviation for distance smoothing \
													(unit scale).");
		// ksmooth_max_stdev = 0.1;	// maximum std. dev. for smoothing (on unit scale)

		add("binw", 'B', &CRM::BINW, 1, "Distance histogram bin width (in base pairs).");
		//add("beta", NULL_CHAR, &CRM::GammaBeta, 0, "Beta parameter for initial distance distribution (A Gamma distribution with alpha=2.0, beta=user-defined).  If set to zero, use a uniform distribution.");
		
				
		// ---- search control ----
		add("init", 'I', &Kseeds, 10, "Number of independently-learned, single-motif CRM models to initalize beam.");
		add("timeout", 'Z', &crmlim, 50, "Stopping criteria:  Maximum number of explored models.");

		// ---- cross-validation ----
		// Both `folds' and `fold' default to 10, and the test applied in this file is
		//	(index modulo folds == fold), so by default no sequence is held aside.
		add("folds", 'N', &folds, 10, "Number of cross-fold validation folds (see `fold' option).");
		add("fold", 'F', &fold, 10, "Current test fold.  To hold aside examples for testing, \
		                             	set this between 0 and N-1 (where `N' is the number of folds--see \
										`folds' option).  A sequence will be held aside for testing \
										if, and only if, sequence index number (zero origin, positive sequences first, \
										then negative) modulo `folds' = `fold'.");
		add("evratio", 'T', &tunerat, 0, "Ratio of training examples to hold aside for evaluation only \
											(otherwise use whole trainset for evaluation).");

		add("seed", 'S', &seed, -1, "PRNG seed (if negative, use system time).");  

		// The verbosity help text is built at run time so it quotes the actual
		//	V_MIN / V_MAX values; debug builds default to the noisiest level.
		ostringstream verbosity_help_oss;
		verbosity_help_oss << "Verbosity level, " << V_MIN << " (conservative) to " << V_MAX << " (ridiculous).";
		#if CRM_DEBUG
		add("verbose", 'v', &verbose, V_MAX, verbosity_help_oss.str());
		#else
		add("verbose", 'v', &verbose, V_IM, verbosity_help_oss.str());
		#endif

	}

};
/** Global, read-only view of the program's user-given options.  Functions in
	this file dereference it (e.g. parameter->verbose) instead of having the
	options passed to them everywhere.
	NOTE(review): nothing in this part of the file assigns it -- presumably it
	is set during start-up before any subroutine runs; confirm.
 */
const GlobalParameters *parameter;





/** Subroutine:
 *
 *	Print Class vs. Prediction grid
 *
 */
void print_classification_grid(const vector<pair<probability, probability> > &cp, ostream &out) {

	// count TP, FP, TN, FN

	double TP=0,FP=0,TN=0,FN=0;

	for (vector<pair<probability, probability> >::const_iterator i=cp.begin(); i!=cp.end(); i++) { 
		//       Class        Prediction
		TP += (i->first) * (i->second);		//  C &  P
		FP += (1-i->first) * (i->second);	// ~C &  P
		TN += (1-i->first) * (1-i->second);	// ~C & ~P
		FN += (i->first) * (1-i->second);	//  C & ~P
	}

	// print grid

	// void print_classification_grid(double TP, double FP, double TN, double FN, ostream&);
	print_classification_grid(TP, FP, TN, FN, out);

	return;
}




/** 
 * Subroutine:
 *
 * Build the NORMALIZED background Markov model named by the `mm' option
 * (parameter->mm_file).
 *
 * - Empty filename:  no file is read; the model is resized to order 0,
 *   zero-filled, given pseudocounts and normalized (the status message
 *   calls this the 0th order uniform distribution).
 * - Otherwise the file is read and normalized; a single warning is printed
 *   for the first zero-probability entry found, if any.
 *
 * Complains and EXITS (status -1) if the file cannot be opened.
 * Status messages go to cerr whenever parameter->verbose is non-zero.
 *
 */
MM<probability> read_markov_model() {

	Timer timer;
	const string filename = parameter->mm_file;

	MM<probability> bg;

	// --- no file:  fall back on the default model ---
	if (filename == "") {
		bg.resize(0);
		bg.fill(0);	
		bg.pseudocount();
		bg.normalize();
		if (parameter->verbose) {
			cerr << "No background distribution file given.  Using 0th order uniform distribution."
				 << " (" << timer << ")" << endl;
		}
		return bg;
	}

	// --- read the model from file ---
	ifstream fin(filename.c_str());
	if (!fin) { 
		cerr << "Bad Markov file:  " << filename << endl;
		exit(-1);
	}
	bg.read(fin);
	fin.close();
	if (parameter->verbose) { 
		cerr << "Read order " << bg.order() << " Markov file \"" << filename << "\"."
			 << " (" << timer << ")" << endl;
	}
	bg.normalize();

	// check for zeros and print a warning (these are bad); stop at the first one
	bool warned = false;
	for (uint order=0; !warned && order<=bg.order(); order++) {
		const probability *const first = bg.begin(order);
		for (const probability *p = first; p != bg.end(order); p++) { 
			if (!(*p == 0)) { continue; }
			cerr << endl
				 << "Warning:  At least one impossible subsequence " 
				 << MM<probability>::index2sequence( p - first , order+1 )
				 << " in Markov model '" << filename << "'"
				 << endl << endl;
			warned = true;
			break;
		}
	}

	return bg;

}


/** Subroutine:
 *
 *	Read sequence data from file -> EXAMPLE vector(s)
 *
 *	1) Read a sequence
 *	2) Store as a *new* IOExample (in ioexamples vector)
 *	3) copy pointer to examples[train/test set][pos/neg example] vector
 *	4) repeat
 *
 */
void read_sequence(istream &in, uint &sid, const probability &default_weight, 
					vector<vector<Example*> > &examples, 	// train/tune/test
			   		vector<IOExample*> &ioexamples) {		// all allocated data

	// assume stream is okay
	IOExample *iox = new IOExample();
	IOExampleCode ioxcode = iox->read(in, parameter->maxw, default_weight);

	if (ioxcode != IOExSUCCESS) { 
		if (ioxcode == IOExSHORT_SEQ) {
			cerr << "Cannot have input sequence shorter than maximum motif width (" 
				 << parameter->minw << ")." << endl; 
		}
		cerr << "Error reading sequence data." << endl;
		exit(-1);
	}
	ioexamples.push_back(iox);

	int t = (sid % parameter->folds == parameter->fold) ? TEST : TRAIN;

	examples[t].push_back(ioexamples.back());	// copy pointer to appropriate set, train or test

	return;

}



/**  
 * Subroutine:  set `x' so that it points to the same example
 *	as named by the next header in this input (`in').
 *	return false iff there is no such example (with that name)
 *
bool pseudofasta_find_example(istream &in, vector<IOExample*> &ioexamples,
								string &name, vector<IOExample*>::iterator &x, 
								string &error_string) {


	// read header
	char c;
	in.get(c);
	if (c != '>') {
		error_string = "expected '>' header";
		return false;
	}
	in.get(c); while (in.good() && c < ' ') { in.get(c); }	// read white space;
	in >> name;
	in.get(c); while (in.good() && c != '\n') { in.get(c); }	// read until EOL

	// okay, instream is good to go, now find sequence with 'name'

	if (x != ioexamples.end() && (*x)->name() == name) { 
		
		// cool, next sequence is the right one.  
		
	} else {
		
		// now I have to search for the sequence
		x = ioexamples.begin();
		while (x != ioexamples.end() && (*x)->name() != name) { x++; }
		if (x == ioexamples.end()) { 
			
			// read until next header or EOF
			while (in.good() && in.peek() != '>') { char c; in.get(c); }
			
			ostringstream oss;
			oss << "No such sequence as \"" << name << "\"";
			error_string = oss.str();
			
			return false;
		}
	}

	return true;
}*/


/** 
 * A FASTA header is "> UNIQUE-ID ...other stuff... <end-of-line>".  This
 * parses out the UNIQUE-ID:  the first whitespace-delimited word, with one
 * leading '>' removed.  If the first word is a lone ">", the following word
 * is the ID.  A header that does not start with '>' yields its first word
 * unchanged.
 */

string fasta_header2id(const string &header) {

	istringstream words(header);

	string id;
	words >> id;

	// no '>' marker at all (or nothing to read):  the first word is the answer
	if (id.empty() || id[0] != '>') { return id; }

	// ">ID":  drop the marker
	if (id.size() > 1) { return id.substr(1); }

	// "> ID":  the marker stood alone, so the ID is the next word
	words >> id;
	return id;

}



/** 
 *	Subroutine:  Read a "pseudofasta" file.  That's a file with
 *	FASTA headers, each followed by whitespace-separated data units of
 *	type T belonging to the sequence named in the header.
 *
 *	filename   file to read
 *	type       human-readable description of the data (used in messages only)
 *	mapping    output:  sequence ID -> vector of data.  An ID that appears
 *	           more than once keeps only its last record; a header with no
 *	           data yields an empty vector.
 *
 *	return:  true iff success.  On failure a message has already been
 *	written to cerr.  Failure means:  the file cannot be opened, a record
 *	does not start with '>' (this includes an empty file), or a data unit
 *	cannot be parsed as a T.
 */
template <typename T>
bool read_pseudofasta(const string &filename, const string &type, map<string,vector<T> > &mapping) {

	Timer timer;
	
	if (parameter->verbose >= V_MAX) { cerr << "Reading \"" << filename << "\"...  " << endl; }
	
	ifstream fin(filename.c_str());
	if (!fin) { 
		cerr << "Failed to open " << type << " file \"" << filename << "\"." << endl; 
		return false;
	}

	while (fin.good()) {
		
		// read white space
		fin >> ws;
		
		// read header (to end-of-line)
		if (fin.peek() != '>') { cerr << "Error reading \"" << filename << "\":  Expected '>'" << endl;  return false; }
		string header = "";
		char c;
		fin.get(c);
		while (fin.good() && c != '\n') { header += c; fin.get(c); }
		const string ID = fasta_header2id(header);

		// read data, up to the next header or the end of the file
		vector<T> &data = mapping[ID];
		data.clear();
		while (true) { 
			fin >> ws;	// skip blank lines etc. so the header test below sees the next real character
			if (!fin.good() || fin.peek() == '>') { break; }	// end of file, or another header is next!
			T t;
			fin >> t;	// read a data unit (T)
			if (fin.fail()) { 
				// Do not store a value that was never read, and do not
				// pretend the (now unreadable) rest of the file was fine.
				cerr << "Error reading \"" << filename << "\":  Malformed " << type 
					 << " data for \"" << ID << "\"" << endl;
				return false;
			}
			data.push_back(t);
		}

	}

	fin.close();
	
	if (parameter->verbose >= V_IM) { cerr << "Read " << type << " file \"" << filename << "\" (" << timer << ")" << endl; }

	return true;

} // read_pseudofasta
			
		







/** Subroutine:
 *
 *	Read sequence data from file -> EXAMPLE vector(s)
 *
 *
 */
void read_data(const vector<string> &args, 
               vector<vector<Example*> > &examples, 				// examples[set][sequenceID]
			   vector<IOExample*> &ioexamples) {					// all allocated data

	// READ SEQUENCE DATA
	uint sid = 0;
	for (uint f=0; f<args.size(); f++) { 
		Timer timer;
		bool positive_input = args.size()==1 || f < args.size()-1;	// input is positive if only one file given,
																	//	or this is not the last input file 
																	//	(i.e. all files positive except last)
		probability default_weight = positive_input ? 1.0 : 0.0;
		ifstream fin(args[f].c_str());
		if (!fin) { cerr << "Could not open FASTA sequence data file \"" << args[f] << "\"." << endl;  exit(-1); }
		if (parameter->verbose >= V_MAX) { cerr << "Reading \"" << args[f] << "\"..." << endl;  } 
		for (; fin.good(); sid++) { 
			fin >> ws;	
			if (!fin.good()) { break; }
			if (parameter->verbose >= V_MAX) { cerr << sid << "\r"; } //  (too fast)
			read_sequence(fin, sid, default_weight, examples, ioexamples);
		}
		if (parameter->verbose >= V_IM) { cerr << "Read sequence data file \"" << args[f] << "\" (" << timer << ")" << endl; }
		fin.close();
	}


	// If given, read BACKGROUND probability file
	if (parameter->background_file != "") { 
		
		// 2006-07-27, consider values log-scale, so read in a map
		// string->(vector of logscales), then store in a map string->(vector
		// of probabilities) where the probabilities were properly initialized
		// directly from the log-scale values (not translated to linear scale
		// where they may lose their values).
		map<string, vector<long double> > x2bg_logscale;
		bool success = read_pseudofasta<long double>(parameter->background_file, 
		                                             "background subsequence", 
													 x2bg_logscale);

		map<string, vector<probability> > x2bg;	
		for (map<string, vector<long double> >::const_iterator i=x2bg_logscale.begin(); i!=x2bg_logscale.end(); i++) { 
			for (vector<long double>::const_iterator j=i->second.begin(); j!=i->second.end(); j++) { 
				x2bg[i->first].push_back( interpret_logscale_probability(*j) );
			}
		}

		if (!success) { exit(-1); }	// error already announced to cerr


		for (vector<IOExample*>::const_iterator iox=ioexamples.begin(); iox!=ioexamples.end(); iox++) { 

			if (x2bg.find((*iox)->name()) != x2bg.end()) { 
				// x2bg has key
				vector<probability> *d = &(x2bg[(*iox)->name()]);
				IOExampleCode code = (*iox)->set_background(*d);
				if (code != IOExSUCCESS) { 
					cerr << "ERROR:  Error reading background distribution data for sequence \""
						 << ((*iox)->name()) << "\"" << endl;
					if (code == IOExINCOMPATABLE) { cerr << "Incorrect data length." << endl; }
					exit(-1);
				}
			} else {
				cerr << "ERROR:  No background distribution given for sequence \"" << ((*iox)->name()) << "\"" << endl;
				exit(-1);
			}
		}
	
	} else {

		// no background given.  use a Markov model
		MM<probability> mm = read_markov_model();	// (will create default model if no Markov filename given)

		for (vector<IOExample*>::const_iterator iox=ioexamples.begin(); iox!=ioexamples.end(); iox++) { 

			IOExampleCode code = (*iox)->set_background(mm);

			if (code != IOExSUCCESS) { 
				cerr << "Error applying Markov chain model to sequence "
					 << ((*iox)->name()) << "." << endl;
				exit(-1);
			}
		}

	}


	// If given, read PRIOR file
	if (parameter->prior_file != "") { 
		
		map<string, vector<probability> > x2priors;
		bool success = read_pseudofasta<probability>(parameter->prior_file, 
		                                             "prior location distribution",
													 x2priors);
		if (!success) { exit(-1); }	// error already announced to cerr
	
		for (vector<IOExample*>::const_iterator iox=ioexamples.begin(); iox!=ioexamples.end(); iox++) { 

			const string id = (*iox)->name();
			if (x2priors.find(id) != x2priors.end()) { 
				// has key
				vector<probability> *d = &(x2priors[id]);
				#if CRM_DEBUG
				for (uint p=0; p<d->size(); p++) { assert(!isnan((*d)[p])); }
				#endif
				(*iox)->add_prior(d->begin(), d->end());
			} else { cerr << "Warning:  No prior distriubtion for sequence \"" << id << "\"" << endl; }

		} // next example

	}


	// if given, read 'weights' file
	if (parameter->weights_file != "") { 
	
		map<string, vector<probability> > x2weight;
		bool success = read_pseudofasta<probability>(parameter->weights_file, 
		                                             "sequence weights", 
													 x2weight);
		if (!success) { exit(-1); }	// error already announced to cerr
	
		for (vector<IOExample*>::const_iterator iox=ioexamples.begin(); iox!=ioexamples.end(); iox++) { 

			const string id = (*iox)->name();
			if (x2weight.find(id) != x2weight.end()) {
				// has key
				(*iox)->reweight( x2weight[id].front() );	// one weight only
			
				if (x2weight[id].size() != 1) { 
				cerr << "Warning:  " << x2weight[id].size() << " entries for sequence \""
					 << id << "\" weight." << endl;
				}

			} else { cerr << "Warning:  No prior weight for sequence \"" << id << "\"" << endl; }

		} // next example
	}


	// divide Trainset into Train', Tune set
	if (parameter->tunerat > 0) { 
		
		vector<Example*> new_trainset;

		for (uint i=0; i<examples[TRAIN].size(); i++) { 
			
			double r = (double)(examples[TUNE].size()) / (new_trainset.size()+examples[TUNE].size());
			if (r < parameter->tunerat) { 
				// Tuneset / Total ratio is a little smaller than it should be
				examples[TUNE].push_back(examples[TRAIN][i]);
			} else {
				new_trainset.push_back(examples[TRAIN][i]);
			}
		}
		examples[TRAIN].resize(new_trainset.size());
		std::copy(new_trainset.begin(), new_trainset.end(), examples[TRAIN].begin());
	}

	/*
	REPLACED by `datastats'
	for (uint t=0; t<NUM_SETS; t++) { 

		cerr << (t==TRAIN ? "TRAINSET:  " : t==TUNE ? "TUNESET:   " : "TESTSET:   ");

		double P=0, N=0;
		for (vector<Example*>::const_iterator i=examples[t].begin(); i!=examples[t].end(); i++) { 
			P += (*i)->weight();
			N += 1 - (*i)->weight();
		}

		cerr << examples[t].size() << " sequences (" << P << "+ " << N << "- )" << endl;

	}
	*/
		
	return;

}






/** Return the first "S" of a randomized list of integers 0 through N-1.
 *
 *	Every integer gets a random sort key from rand(); the integers are then
 *	ordered by key and the first S of them are returned.
 *
 *	If S is larger than N the list is extended to 0 through S-1, so that S
 *	values can still be returned (some of them are then >= N).
 */
vector<uint> random_integers(uint N, uint S) {

	const uint total = (N > S) ? N : S;	// list length:  the larger of N and S

	// (random key, integer) pairs
	vector<pair<double, uint> > keyed;
	for (uint value=0; value<total; value++) { 
		const double key = ((double)rand()/RAND_MAX);
		keyed.push_back(pair<double,uint>(key, value));
	}

	// sorting by key shuffles the integers
	std::sort(keyed.begin(), keyed.end());

	vector<uint> ans;
	for (uint k=0; k<S; k++) { ans.push_back(keyed[k].second); }
	return ans;
}

	                                                               
/** 
 *
 *	Subroutine:
 *	
 *	Create a random motif (a PWM) to seed the learner with.
 *
 *	The motif width W is drawn between parameter->minw and parameter->maxw.
 *	Only the middle W/2 columns are seeded; the others stay uniform.
 *
 *	With parameter->projection set, the seed comes from random projection:
 *	a few random columns ("open windows") are chosen, the bases found at
 *	those columns are tallied over the sequences in `data', and the
 *	highest-scoring combination is written into the PWM.  Otherwise a random
 *	half of the seeded columns each get one random base boosted.
 *
 *	data             sequences to project from
 *	reverse_weights  use (1 - weight) instead of each sequence's weight
 *
 */
PWM<probability> random_motif(const vector<Example*> &data, bool reverse_weights = false) {

	const uint narrowest = parameter->minw;
	const uint widest = parameter->maxw;
	const uint W = narrowest + rand() % (widest-narrowest+1);	// motif width
	const uint w = W/2;	// int div., width of seeded motif
	PWM<probability> ans(w);
	ans.fill(0);

	if (!parameter->projection) { 

		// no projection:  flat PWM with random single-base boosts
		ans.fill(1);
		const vector<uint> boosted = random_integers(w, w/2);
		for (uint k=0; k<boosted.size(); k++) { 
			ans(boosted[k], rand()%DNA_ABLEN) += 1;
		}
		ans.normalize();

	} else {

		const uint window_size = std::min((uint)5, w);	// running example:  window_size = 5;

		uint best_key = 0;				// best index
		probability best_count = 0;		// best count

		// one tally per combination of bases at the open windows
		vector<probability> counts(1 << (2*(window_size)));	// 2 bits/dna, running example:  |counts| = 1024;
		std::fill(counts.begin(), counts.end(), 0);
		const vector<uint> open_windows = random_integers(w, window_size);	// running example:  open_windows = < 3 1 9 7 12 >

		for (uint s=0; s<data.size(); s++) { 
			Example *example = data[s];
			const string *sequence = example->sequence();
			const uint L = sequence->length();
			const uint frame = rand()%w;	// choose a reading frame so no intra-motif patterns are included.

			for (uint start=frame; start<=(L-w); start+=w) {

				// pack the bases seen through the open windows into one key
				uint key = 0;
				for (uint j=0; j<open_windows.size(); j++) {
					const uint DNA_ID = dna_id( (*sequence)[start+open_windows[j]] );
					// normal case:  a,c,g,t.  Anything else is assumed to be
					// 'n', an unknown base, and leaves the key unchanged.
					if (DNA_ID < DNA_ABLEN) { 
						key |= DNA_ID << (2*j);
						// running example:  j=3, open_windows[j]=7, sequence='c', 
						//	key |= 00 01 000000
					}
				}

				probability &tally = counts[key];
				tally++;

				probability weight = example->weight();
				if (reverse_weights) { weight = 1 - weight; }

				tally += ( weight / example->bg(start, start+w) );	
					// weight directly by sequence weight (i.e. 1 for positive),
					// inversely by BG likelihood 

				if (tally > best_count) { 
					best_count = tally;
					best_key = key;
				}
			} // next motif position
		} // next sequence

		// running example:  best_key = 11 01 00 01 10 <=> G C A C T (note reverse order)

		// transfer choice over to PWM
		for (uint j=0; j<window_size; j++) { 
			const uint base = (best_key >> (2*j)) & 3;	// 3 is 11: 2 bits all 1's
			ans(open_windows[j], base) += 1;
				// running example:  ans(7, 'c') += 1;
		}

		// running example:  open_windows = < 3 1 9 7 12 >
		// running example:  PWM.mle() = nCnTnnnCnAnnGnn...

		ans.pseudocount();
		ans.normalize();
	}

	// copy the PWM we just seeded (PPPPPPPPPP) to:  nnnnnPPPPPPPPPPnnnnn

	PWM<probability> padded(W);
	for (uint column=0; column<W; column++) { 
		for (uint b=0; b<DNA_ABLEN; b++) { padded(column,b) = 1.0/DNA_ABLEN; }
	}

	const uint left_pad = w/2;
	for (uint column=0; column<w; column++) { 
		for (uint b=0; b<DNA_ABLEN; b++) { padded(column+left_pad,b) = ans(column, b); }
	}

	return padded;

}

/** Prune a CRM in place.
 *
 *	- A motif whose preference is below the threshold (0.1) is erased.
 *	- Each remaining motif is trimmed to the span between its first and last
 *	  column that holds at least one base value below the threshold, and is
 *	  given one extra column on either side as room to grow.  A motif with
 *	  no such span (or a span of a single column) is erased as well.
 *	- A binding site left without motifs is removed; the others are
 *	  normalized.
 */
void clean(CRM &crm) {

	const probability T = 0.1;	// threshold for motif preference and for base values

	uint i = 0;
	while (i < crm.size()) {

		uint m = 0;
		while (m < crm.site(i).multiplicity()) {

			const uint w = crm.site(i).motif(m).width();

			// insignificant motif:  remove it and look at the one that takes its place
			if (crm.site(i).motif_preference(m) < T) { 
				crm.site(i).erase(m);
				continue;
			}

			// first column (from the left) that has a base value below T
			uint Lsite = 0;
			for (; Lsite<w; Lsite++) { 
				uint base = 0;
				while (base<DNA_ABLEN && !(crm.site(i).motif(m)(Lsite,base) < T)) { base++; }
				if (base < DNA_ABLEN) { break; }
			}

			// same from the right.  The counter is unsigned:  stepping below
			// zero wraps around to a huge value, which ends the loop.
			uint Rsite = w - 1;
			for (; Rsite<w; Rsite--) { 
				uint base = 0;
				while (base<DNA_ABLEN && !(crm.site(i).motif(m)(Rsite,base) < T)) { base++; }
				if (base < DNA_ABLEN) { break; }
			}

			// nothing worth keeping:  remove this motif, too
			if (Lsite >= Rsite || Lsite > w || Rsite > w) { 
				crm.site(i).erase(m);
				continue;
			}

			// copy and replace PWM
			PWM<probability> pwm(Rsite - Lsite + 1 + 2);	// +2 for one on either side
			for (uint site=0; site<pwm.width(); site++) { 
				const bool middle = (site != 0 && site != pwm.width() - 1);
				if (middle) { 
					for (uint base=0; base<DNA_ABLEN; base++) { 
						pwm(site,base) = crm.site(i).motif(m)(Lsite + site - 1, base);
					}
				} else {
					// end column (grow-room)
					for (uint base=0; base<DNA_ABLEN; base++) { pwm(site,base) = 1; }
				}
			}
			crm.site(i).replace(m, pwm);

			m++;
		}

		if (crm.site(i).multiplicity() != 0) { 
			crm.site(i).normalize();
			i++;
		} else {
			crm.remove_site(i);	// the next site now has index i
		}

	}

}



vector<probability> mle(const vector<probability> &predictions,
											 probability threshold=0.5) { 
	vector<probability> ans;
	for (vector<probability>::const_iterator i = predictions.begin(); i != predictions.end(); i++) {
		ans.push_back( (*i >= threshold) ? 1 : 0 );
	}
	return ans;
}
	


/** Pair every example's weight with the value probabilityof() computes for
 *	it under `crm'.  Returns one (weight, probability) pair per example, in
 *	the order of `data'.  `locations' is passed through to probabilityof()
 *	untouched.
 */
vector<pair<probability,probability> > get_predictions(const CRM &crm, const vector<Example*> &data, vector<CRMLocation> *locations=NULL) {

	const vector<probability> predicted = probabilityof(crm, data, locations);

	vector<pair<probability,probability> > ans;
	uint x = 0;
	for (vector<Example*>::const_iterator example=data.begin(); example!=data.end(); example++, x++) { 
		ans.push_back( pair<probability,probability>((*example)->weight() , predicted[x]) );
	}

	return ans;
}

/** Score `crm' on `data':  compute the (weight, prediction) pairs and apply
 *	the scoring metric to them.  At maximal verbosity the metric, the score
 *	and the elapsed time are also written to cerr.
 */
double test(const CRM &crm, const vector<Example*> &data) { 

	Timer timer;

	vector<pair<probability,probability> > class_predictions = get_predictions(crm, data);
	const double score = metric(class_predictions);

	if (parameter->verbose < V_MAX) { return score; }

	// report what was measured, what it came to, and how long it took
	metric(cerr);
	cerr << " = " << score << " (" << timer << ")" << endl; 

	return score;
}



/** Subroutine for `learn' */
void enqueue(list<pair<CRM, double> > &Q, CRM &alt, double score) { 

	// push onto Q if there's room or score is better than somebody already in the Q

	if (Q.size() == parameter->K && Q.back().second > score) { 
		if (parameter->verbose >= V_MAX) {  cerr << "This CRM does not fit into queue." << endl; }

	} else {
	
		list<pair<CRM,double> >::iterator it;
		for (it=Q.begin(); it->second >= score && it!=Q.end(); it++);

		Q.insert(it, pair<CRM,double>(alt, score));
		if (parameter->verbose >= V_MAX) { cerr << "This CRM is now in the queue." << endl; }
		
		while (Q.size() > parameter->K) { Q.pop_back(); }
	
	}
	return;
}

				
/** Subroutine for `learn':  Compare current solution to best.
 *
 *	If `alt_score' is strictly greater than the score stored in `solution',
 *	`solution' is overwritten with a copy of (alt, alt_score) and true is
 *	returned.  Otherwise `solution' is left alone and false is returned.
 */
bool compare(pair<CRM,double> &solution, const CRM &alt, double alt_score) { 

	// not an improvement:  keep what we have
	if (!(alt_score > solution.second)) { return false; }

	if (parameter->verbose >= V_IM) {
		cerr << " > " << solution.second;	// the score that was just beaten
	}

	solution = pair<CRM,double>(alt, alt_score ); // copy
	return true;
}




/** Subroutine to `train':  Smooth distance distributions (depending on parameters).
 *
 *	N is the total mass of the first distance distribution, crm.distance(0).
 *
 *	- N > 0:  every distance distribution of the CRM (crm.distance(i) and
 *	  crm.distance(i,j), i < j) is smoothed with one bell-shaped kernel.
 *	  On unit scale its width parameter is 1/sqrt(N), capped at
 *	  parameter->ksmooth_max_stdev; the kernel extends
 *	  parameter->ksmooth_stdevs of those widths to either side.
 *	- otherwise:  every distance distribution is made uniform.
 *
 *	A kernel width of zero leaves the distributions unsmoothed.
 */
void smooth_distance(CRM &crm) {

	probability N = 0;

	if (crm.size()) { 
		for (uint i=0; i<crm.distance(0).size(); i++) { N += crm.distance(0).prob(i); }
	}

	if (N > 0) {

		// come up with a nice kernel.	

		double unit_scale_sigma = 1.0 / sqrt( (double)N );	
		if (unit_scale_sigma > parameter->ksmooth_max_stdev) {
			unit_scale_sigma = parameter->ksmooth_max_stdev;
		}

		const double sigma = (CRM::MAXL/CRM::BINW) * unit_scale_sigma;	// width in histogram bins
		if (!(sigma > 0)) { 
			// A zero-width kernel cannot be evaluated (division by zero
			// below would fill it with NaN); it means "no smoothing".
			return;
		}
		const double sigma2 = sigma*sigma;
		const double PI = 3.14159265358979323846;
		const double C = 1 / ( sigma * sqrt(2*PI));

		uint H = (uint)(parameter->ksmooth_stdevs * sigma);	// Half the kernel width = SGSD deviations

		// never wider than the distribution itself
		if (2*H+1 > CRM::MAXL / CRM::BINW) { H = (CRM::MAXL / CRM::BINW) / 2; }

		// symmetric kernel of 2*H+1 entries, centered on index H
		vector<probability> kernel(2*H+1);

		for (uint d=0; d<=H; d++) { 
			const double x = (double)d;	// distance from the center
			// NOTE(review): the exponent divides by sigma^2, not 2*sigma^2,
			// so the bell is narrower than a Gaussian with standard
			// deviation sigma (which is what C is the constant for).
			// Left as found -- confirm which one is intended.
			kernel[H-d] = kernel[H+d] = C * exp(-(x*x) / sigma2);
		}

		for (uint i=0; i<crm.size(); i++) {
			crm.distance(i).ksmooth(kernel);
			for (uint j=i+1; j<crm.size(); j++) { 
				crm.distance(i,j).ksmooth(kernel);
			}	
		}

	} else { 
		
		// Nothing observed:  fall back on uniform distributions.  Built with
		// fill(1) + normalize(), the same way reset_uniform() does it;
		// "1 / size()" would truncate to 0 if size() is an integer.
		for (uint i=0; i<crm.size(); i++) {
			crm.distance(i).fill(1);
			crm.distance(i).normalize();
			for (uint j=i+1; j<crm.size(); j++) { 
				crm.distance(i,j).fill(1);
				crm.distance(i,j).normalize();
			}	
		}

	}

}


/** Subroutine to `train':  overwrite learned parameters with uniform ones
 *	wherever the user asked for a uniform distribution.
 *
 *	Four independent switches are honoured, each applied per binding site:
 *		parameter->uniform_distance   distance distributions (site and pairs i<j)
 *		parameter->uniform_strand     strand preferences
 *		parameter->uniform_order      pairwise order parameters
 *		parameter->uniform_motif      motif preferences within a site
 */
void reset_uniform(CRM &crm) {

	// the switches do not change while we work, so read them once
	const bool flat_distance = parameter->uniform_distance;
	const bool flat_strand   = parameter->uniform_strand;
	const bool flat_order    = parameter->uniform_order;
	const bool flat_motif    = parameter->uniform_motif;

	for (uint a=0; a<crm.size(); ++a) {

		if (flat_distance) {
			// constant fill followed by normalize
			crm.distance(a).fill(1);
			crm.distance(a).normalize();
			for (uint b=a+1; b<crm.size(); ++b) { 
				crm.distance(a,b).fill(1);
				crm.distance(a,b).normalize();
			}
		}

		if (flat_strand) { 
			// every strand gets the same share
			const probability even = (probability)(1.0 / BindingSite::NUM_STRANDS);
			crm.site(a).strand_preference(BindingSite::TCX)  = even;
			crm.site(a).strand_preference(BindingSite::TMPL) = even;
		}

		if (flat_order) { 
			// the diagonal order(a,a) is written too; the original author notes it has no effect
			for (uint b=0; b<crm.size(); ++b) { 
				crm.order(a,b) = 1;
			}
		}

		if (flat_motif) { 
			// each alternative motif of this site becomes equally preferred
			for (uint motif=0; motif<crm.site(a).multiplicity(); ++motif) { 
				crm.site(a).motif_preference(motif) = ( 1.0 / crm.site(a).multiplicity() );
			}
		}
	}

}


/** Subroutine:  restore the leading part of `crm' from `base'.
 *
 *	For the first base.size() sites, the binding site itself, its distance
 *	distribution, and every pairwise distance and order entry among those
 *	sites are copied from `base' into `crm'.
 */
void reset_base(CRM &crm, const CRM &base) { 

	const uint fixed = base.size();		// number of leading sites owned by the base model

	uint row = 0;
	while (row < fixed) { 

		crm.site(row)     = base.site(row);		// copy
		crm.distance(row) = base.distance(row);	// copy

		// pairwise parameters against every base site
		for (uint col=0; col!=fixed; ++col) { 
			crm.distance(row,col) = base.distance(row,col);	// copy
			crm.order(row,col)    = base.order(row,col);
		}

		++row;
	}

} 


/** Decide whether the given CRM subset (of binding sites) is the positive version of the given CRM.
	This is the case iff the subset has all and only the non-negated sites of the given CRM */
/*

MOVED TO CRM.h

bool is_positive_path(const CRM &subset, const CRM &crm) { 
	return ( subset.negated() == 0 && subset.size() == crm.size() - crm.negated() );
}
*/

/**
 *	Subroutine for `learn':  Train a model with EM 
 *
 *	crm is CRM to train (updated in place)
 *	base is base of CRM (all CRMs have base as submodel, may have to replace learned params with base)
 *	data is (positive/negative weighted data)
 *
 *	Runs parameter->emi1 iterations for a single-site CRM and parameter->emi2
 *	iterations otherwise.  Each iteration performs either the generative
 *	update (replace `crm' by the expected parameters returned by EM) or the
 *	discriminative update
 *
 *		CRM_new <- CRM_old + eta * ( CRM_correct - CRM_used )
 *
 *	summed over the powerset of binding sites, followed by normalization,
 *	optional pseudocounts, the user's uniform resets, the optional reset of
 *	the fixed base submodel, and the per-iteration custom constraint hook.
 */
void train(CRM &crm, const CRM &base, const vector<Example*> &data) {

	Timer timer;
	if (parameter->verbose >= V_MAX) { 
		cerr << "Train CRM on " << data.size() << " sequences..." << endl; 
	}

	const uint ITERATIONS = (crm.size()==1 ? parameter->emi1 : parameter->emi2);

	constrain_preconditions(crm, data);	// before running algo, make sure crm is allowed by customization

	for (uint krogh_it=0; krogh_it<ITERATIONS; krogh_it++) {

		if (parameter->verbose >= V_MAX) { 
			cerr << "EM iteration " << (krogh_it+1) 
				 << " of " << ITERATIONS
				 << " "; 
		}

		Timer em_timer;

		// DO DISCRIMINATIVE OR GENERATIVE ALGORITHM, BUT STORE RESULT IN `crm'

		if (parameter->generative) { 

			// GENERATIVE ALGORITHM

			#if CRM_DEBUG
			assert(parameter->G == 0);
			#endif
			
			CRM correct;
			EM(crm, data, NULL, &correct, !crm.negated());	// (this means max. l'hood of negative examples if CRM is negated)
			crm = correct;		// jus' copy it in generative mode	
			crm.normalize();
			smooth_distance(crm);		
			crm.pseudocount(parameter->pseudocount);
			crm.normalize();

		} else {

			// DISCRIMINATIVE ALGORITHM

			// Train the model, remember:  CRM_new <-- CRM_old + eta( CRM_correct - CRM_used )

			vector<CRM> powerset = crm.powerset();
			vector<CRM> used(powerset.size());		// unaltered expected parameters used in paths
			vector<CRM> correct(powerset.size());	// normalized over all *appropriate* sequences

			vector<probability> path_lhood(powerset.size());		// sequence likelihood given each path
			
			probability hmm_lhood = 0;	// sum of sequence over all paths
			probability pos_lhood = 0;	// likelihood over paths which are correct for positive examples
			probability neg_lhood = 0;

			#if CRM_DEBUG
			uint positive_path_found = 0;	// debugging only: count number of "positive paths"
			#endif

			for (uint p=0; p<powerset.size(); p++) { 

				// test if this subset is the positive path (only and all non-negated binding sites)
				bool p_pos_path = CRM::is_positive_path(powerset[p], crm);

				// If we're making the "two worlds" assumption, do not compute
				//	probabilities for paths not under consideration.
				if (parameter->twoworlds && p!=0 && p!=powerset.size()-1 && !p_pos_path) { continue; }
				
				#if CRM_DEBUG
				if (p_pos_path) { assert(positive_path_found++ == 0); }	// exactly 1 pos. path
				#endif

				hmm_lhood += path_lhood[p] = EM(powerset[p], data, &(used[p]), &(correct[p]), p_pos_path);

				smooth_distance(used[p]);		// smooth before add, subtract
				smooth_distance(correct[p]);	// 
			
				if (p_pos_path) { pos_lhood += path_lhood[p]; }	
				else            { neg_lhood += path_lhood[p]; }	// total l'hood in neg. paths (need to distribute later)

			} // next CRM in powerset

			
			#if CRM_DEBUG
			assert(positive_path_found == 1);	// assert that positive path was found
			#endif

			// `eta' is the learning rate.  Start with user-given value.  The
			// update is:
			//
			//		CRM_time=t+1  <-  CRM_time=t + eta *
			//		(expected-parameters-used-in-CORRECT-paths -
			//		expected-parameters-used-in-ALL-paths)
			//
			// IF any parameters fall below zero because of the subtraction,
			// we'll decrease the value of `eta' (by dividing by two) and try
			// again.  Forever.  Until the subtraction does not fail.
			//
			// Note:  A better way would be to record how much below zero the
			// parameters fall, calculate the maximum `eta' (minus epsilon
			// perhaps), and recalculate CRM_time=t+1 without fear.
			probability eta = parameter->eta;
			
			CRM backup = crm;	// backup copy in case parameters fall below zero
			bool subzero = true;
			while (subzero) {

				subzero = false;

				for (uint p=0; p<powerset.size(); p++) {
				
					bool p_pos_path = CRM::is_positive_path(powerset[p], crm);
				
					// ignore certain paths under the "two worlds" assumption
					if (parameter->twoworlds && p!=0 && p!=powerset.size()-1 && !p_pos_path) { continue; }

					// add parameters used when this path is correct (to extent it's the chosen correct path)
					if (p_pos_path) { 
						crm.add(correct[p], eta * (path_lhood[p] / pos_lhood), p);
							// NOTE:  (path_lhood[p] / pos_lhood) should be 1.0--there's only one correct positive path
					} else {
						crm.add(correct[p], eta * (path_lhood[p] / neg_lhood), p);
							// NOTE:  This doesn't change `crm' when p = 0, but the empty CRM 
							//	does count for some of the neg_lhood.
					}

					// Subtract parameters used in this path and stop at the
					// first underflow so the whole update can be retried.
					//
					// This used to read `subzero &= crm.subtract(...)'.  Since
					// the flag is cleared at the top of the while loop, AND-ing
					// could never set it, and the eta-halving retry below was
					// unreachable.
					//
					// NOTE(review):  assumes CRM::subtract returns true when a
					// parameter fell below zero (as the flag's name suggests).
					// Confirm against CRM.h; if it reports success instead,
					// the test must be negated.
					if (crm.subtract(used[p], eta * (path_lhood[p] / hmm_lhood), p)) {	// hmm_lhood is sum over p of path_lhood[p]
						subzero = true;
						break;
					}

				}

				if (subzero) { 
					eta /= 2.0;
					crm = backup;	// restore backup copy
					#if CRM_DEBUG
					static uint erc = 0;
					cerr << "Warning (CRM debug mode):  During CRM update, parameters fell below zero (" << (++erc) << ").  "
						 << "Resetting eta = " << eta << endl;
					#endif
				}

			} // while (subzero)

		} // if generative/discriminative

		crm.normalize();

		if (parameter->pseudocount > 0) { 
			crm.pseudocount(parameter->pseudocount);
			crm.normalize(); 
		}

		// Call subroutine to re-uniform certain parameters (if user said to do so)
		reset_uniform(crm);

		// Call subroutine to reset submodel to a "base" submodel (if base is to remain fixed)
		if (parameter->base_fixed) { reset_base(crm, base); }
	
		// make sure it's normal after custom code (better safe than sorry?)
		crm.normalize();

		// verbose...
		if (parameter->verbose >= V_MAX) {
		
			cerr << " (" << em_timer << ")" << "\t";

			crm.summarize(cerr);
			
			if (data.size()) { 
			
				// this actually involves unnecessary calls to `inside' (via `probabilityof')
				//	to see how probability on some training sequences is changing
			
				vector<Example*> first_and_last;
				first_and_last.push_back(data[0]);
				first_and_last.push_back(data[data.size()-1]);
				vector<probability> p_first_and_last = probabilityof(crm, first_and_last);
		
				cerr << "\tP(seq#1|CRM)=" << p_first_and_last[0] << " "
					 << "\tP(seq#" << (data.size()) << "|CRM)=" << p_first_and_last[1] << endl; 
			}
		}

		// constrain the CRM model if necessary
		constrain_per_iteration(crm, data);

	} // next EM iteration
		
	// constrain the CRM model if necessary
	constrain_finalize(crm, data);

	if (parameter->verbose >= V_MAX) { cerr << "Done training (" << timer << ")" << endl; }
	if (parameter->verbose >= V_MAX) { cerr << "Results of training:" << endl << crm << endl; }

	return;
}


/** Subroutine of `learn':  run one candidate model through the whole pipeline.
 *
 *	alt              candidate model; trained in place unless training is skipped
 *	base             base submodel passed on to `train'
 *	data             training examples
 *	evaluation_data  examples the score is computed on
 *	solution         best (model, score) so far; replaced if `alt' scores higher
 *	Q                search queue the scored candidate is offered to
 *	number           running index of the candidate, used in the log output
 *
 *	Steps:  train (skipped when the base is fixed and `alt' has exactly as
 *	many sites as `base'), score, compare against the best, enqueue.
 */
void consider(CRM &alt, 
			  const CRM &base,
			  const vector<Example*> &data,
			  const vector<Example*> &evaluation_data,	
			  pair<CRM,double> &solution,
			  list<pair<CRM,double> > &Q,
			  int number) {

	const bool brief_log = (parameter->verbose >= V_IM);
	const bool full_log  = (parameter->verbose >= V_MAX);

	if (full_log) {
		if (number >= 0) { cerr << "(" << number << ") "; }	// number of models considered
		cerr << "Consider a new model:" << endl << alt << endl;
	}

	Timer elapsed;

	// TRAIN THE MODEL (also calls reset_base() and custom constraints functions);
	// pointless when the model is the fixed base itself, so skip it then
	if (alt.size() != base.size() || !parameter->base_fixed) {
		train(alt, base, data);
	}

	// EVALUATE
	const double score = test(alt, evaluation_data);

	// one log line:  number...CRM_SUMMARY...Fx=SCORE[ > SCORE] (elapsed time)\n
	if (brief_log) { 
		cerr << number << "\t";
		alt.summarize(cerr);
		cerr << "\t";
		metric(cerr);
		cerr << " = " << score;
	}

	compare(solution, alt, score);		// may append " > <old best>" to the line above

	if (brief_log) { cerr << " (" << elapsed << ")" << endl; }

	enqueue(Q, alt, score);	// the queue keeps its own copy if the score is good enough

	return;
} // consider




/** Search for the best-scoring CRM that extends `base'.
 *
 *	base             submodel every candidate starts from
 *	data             training examples
 *	evaluation_data  examples used to score candidates
 *
 *	The search keeps a bounded queue of scored models (see `enqueue') and the
 *	best model seen so far.  It proceeds in four stages:
 *
 *	  1. user-supplied candidate motifs (parameter->cm_file) are scored but
 *	     not trained, and do not count towards the model limit;
 *	  2. the base model itself is considered, if it is non-empty;
 *	  3. the queue is seeded with up to parameter->Kseeds random motifs;
 *	  4. until parameter->crmlim models have been considered, the front of
 *	     the queue is popped and extended by the three operators below.
 *
 *	Returns the best model found (a default-constructed CRM if no model ever
 *	scored above 0).
 */
CRM learn(const CRM &base, 	// 'base' CRM model (must start with this submodel)
		  const vector<Example*> &data,
		  const vector<Example*> &evaluation_data) {

	Timer timer;

	uint cmc = 0;	// CRM models considered

	list<pair<CRM,double> > Q;	// the queue
	pair<CRM, double> solution; solution.second = 0;

	// Start with user-defined candidate motifs (single-motif CRMs)
	if (parameter->cm_file != "") {
		if (parameter->verbose >= V_MAX) { cerr << "Reading candidate motif FASTA file, \"" << parameter->cm_file << "\"..." << endl; }
		ifstream fin(parameter->cm_file.c_str());
		if (!fin) { cerr << "Cannot open candidate motif FASTA file, \"" << parameter->cm_file << "\"" << endl;  exit(-1); }
		uint num_candidates = 0;
		// NOTE:  Do not test cmc < crmlim if these don't count... while (cmc < parameter->crmlim && fin.good())
		while (fin.good()) {
			string header, motif;
			uint fasta_code = fasta_read(fin, header, motif);
			if (fasta_code != fasta_success) { 
				cerr << "FASTA read error (" << fasta_message[fasta_code] 
				     << " in \"" << parameter->cm_file << "\")." << endl;
				exit(-1); 
			}
			PWM<probability> pwm(motif.length());
			pwm.add(motif);
			pwm.pseudocount(parameter->pseudocount);
			CRM alt = base;		// always start with 'base'
			alt.add_site(pwm);
			alt.normalize();

			// test, but do not train candidate motif (does not count as a considered model)
			double score = test(alt, evaluation_data);
			if (parameter->verbose >= V_IM) { 
				// c##  (sAGCTnnTTTT) & (nnTCAAGT)  F1 = 0.937 > 0.822 EOL
				cerr << "c" << ++num_candidates << "\t";
				alt.summarize(cerr);
				cerr << "\t" << metric() << " = " << score;
			}
			compare(solution, alt, score);
			if (parameter->verbose >= V_MAX) { cerr << endl; }	// noto, 2006-07-13, ad-hoc fix
			enqueue(Q, alt, score);
			cerr << endl;
			
		}
	}


	// Try learning from base itself (2006-07-17)
	if (base.size()) {
		// if (! parameter->base_fixed) 
		{
		 	// NOTE:  could consider base only if parameters are not fixed, 
			//	but I think a score for the base model is good to include.
			//	The question is, should I count this as one explored model? 
			//	Maybe only if base is not fixed?
			
			CRM alt = base;	// copy

			// a fixed base is scored but not counted as an explored model
			if (parameter->base_fixed) { 
				consider(alt, base, data, evaluation_data, solution, Q, cmc);	
			} else {
				consider(alt, base, data, evaluation_data, solution, Q, ++cmc);
			}
		}
	}


	// Seed Queue with 'Kseeds' random motifs 
	for (uint i=0; cmc < parameter->crmlim && i<parameter->Kseeds; i++) { 
		if (parameter->verbose >= V_MAX) { 
			cerr << "Seeding the queue (" << timer << " elapsed), motif " << (i+1) 
				 << " of " << parameter->Kseeds << "..." << endl; 
		}
		CRM alt = base;	// start with base, actually
		alt.add_site(random_motif(data));
		consider(alt, base, data, evaluation_data, solution, Q, ++cmc);	// this will add to the queue
	}

	// Loop until the model limit is reached
	while (cmc < parameter->crmlim) {	// main Q loop
	
		// for verbose mode, write state of Q
		if (parameter->verbose >= V_MAX) {
			cerr << "Queue (" << timer << " elapsed):  ";
			if (Q.empty()) { cerr << "(empty)"; }
			cerr << endl;
			uint index = 0;
			for (list<pair<CRM,double> >::const_iterator it=Q.begin(); it!=Q.end(); it++) { 
				cerr << ++index << "\t" << (it->second) << "\t";
				(it->first).summarize(cerr);
				cerr << endl;
			}
		}

		if (Q.empty()) { 
			// add one motif to make sure Queue is nonempty
			if (parameter->verbose >= V_MAX) { cerr << "Reseed empty Queue (" << timer << " elapsed)" << endl; }
			CRM alt = base;
			alt.add_site(random_motif(data));
			consider(alt, base, data, evaluation_data, solution, Q, ++cmc);

			// A queue of capacity zero stays empty even after reseeding.
			// There is then no theory to extend, so go around again (cmc
			// was incremented above, so the loop still terminates) instead
			// of reading Q.front() on an empty list.
			if (Q.empty()) { continue; }
		}

		#if CRM_DEBUG
		assert(!Q.empty());
		#endif

		const CRM theory = Q.front().first;	// copy
		Q.pop_front();	// remove from further consideration


		for (uint theory_itr=0; cmc < parameter->crmlim && theory_itr<parameter->theory_iterations; theory_itr++) {

			//  Operate!

			// OP:  add disjunct motif to last site to best working theory
			if (theory.site(theory.size()-1).multiplicity() < parameter->D) {

				// special consideration:  if you're adding a disjunct to the
				// base, skip this step (i.e. only add disjuncts to *new*
				// conjuncts).  NOTE:  To allow addition of disjunct motifs to
				// last conjunct binding site of the base model, we'll have to
				// update the `reset_base' function to re-adjust only the
				// base-disjuncts of the last BindingSite.  It currently just
				// overwrites the first base.size BindingSite objects.
				
				if (theory.size() > base.size()) { 
					CRM alt = theory;
					alt.add_motif(theory.size()-1, random_motif(data));
					consider(alt, base, data, evaluation_data, solution, Q, ++cmc);
					if (cmc >= parameter->crmlim) { break; }

				}
			}

			// OP:  add un-negated site to best working theory
			if (theory.size() < parameter->C) {
				CRM alt = theory;
				alt.add_site(random_motif(data));
				consider(alt, base, data, evaluation_data, solution, Q, ++cmc);
				if (cmc >= parameter->crmlim) { break; }
			}

			// OP:  add negated Site
			if (theory.negated() < parameter->G && theory.size() < parameter->C) {
				CRM alt = theory; // copy
				alt.add_site(random_motif(data, true));
				alt.site(alt.size()-1).negated = true;
				consider(alt, base, data, evaluation_data, solution, Q, ++cmc);
				if (cmc >= parameter->crmlim) { break; }
			}
			

		} // next iteration with this theory

	} // main loop

	Q.clear();
	return solution.first;
}

/** Subroutine:  write a short size report for the train/tune/test sets.
 *
 *	out   stream to write to
 *	data  one vector of examples per set, indexed by TRAIN, TUNE, TEST
 *
 *	For each set, prints the number of sequences together with the summed
 *	weight (reported as "+") and the summed (1 - weight) (reported as "-").
 */
void datastats(ostream &out, const vector<vector<Example*> > &data) { 

	#if CRM_DEBUG
	assert(data.size() == NUM_SETS);	// assumption of this subroutine
	#endif

	out << endl << "Data statistics:" << endl;

	for (uint which=0; which<NUM_SETS; ++which) { 

		const vector<Example*> &members = data[which];

		// label, padded to a common width
		if      (which == TRAIN) { out << "TRAINSET:  "; }
		else if (which == TUNE)  { out << "TUNESET:   "; }
		else                     { out << "TESTSET:   "; }

		// accumulate positive and negative mass over the set
		double positive = 0, negative = 0;
		for (uint s=0; s<members.size(); ++s) { 
			positive += members[s]->weight();
			negative += 1 - members[s]->weight();
		}

		out << members.size() << " sequences (" << positive << "+ " << negative << "- )" << endl;

	}

	out << endl;
	return;
}

/** Program entry point.
 *
 *	1. Parse the command line into the global options object (`parameter')
 *	   and reject illegal option combinations.
 *	2. Echo the settings, seed the PRNG, and read the sequence data into the
 *	   train/tune/test sets.
 *	3. Set the static distance limits CRM::MAXL (and CRM::BINW for uniform
 *	   distances) from the data.
 *	4. Optionally read a base model, then either learn a CRM or load one
 *	   from file.
 *	5. Print the model, optionally save it and its distance distributions,
 *	   and evaluate it on every non-empty data set.
 *
 *	Per-sequence predictions go to stdout; everything else goes to stderr.
 *	Returns 0 on success; every error path calls exit(-1).
 */
int main(int argc, char **argv) {

	GlobalParameters parser(PROGRAM_VERSION);
	parameter = &(parser);	// publish the options globally before parsing
	vector<string> args(parser.parse(argc, argv, "Postive-FASTA-input Negative-FASTA-input"));

	// Check for legal input (complain and print how-to-get-help message.)
	if (args.size()==0) {
		cerr << argv[0] << ":  Zero input sequences." << endl;
		cerr << "(`" << argv[0] << " --help' for help.)" << endl;
		parser.usage(cerr);
		exit(-1);
	}

	if (parameter->folds == 0) { 
		cerr << "Illegal value:  folds = " << parameter->folds << endl; 
		cerr << "(`" << argv[0] << " --help' for help.)" << endl;
		exit(-1); 
	}

	if (parameter->minw > parameter->maxw) { 
		cerr << "Minimum motif width (" << parameter->minw 
			 << ") is larger than maximum (" << parameter->maxw
			 << ")." << endl;
		cerr << "(`" << argv[0] << " --help' for help.)" << endl;
		exit(-1);
	}
	
	if (parameter->minw < 2) { 
		cerr << "Minimum motif width is too small (" << (parameter->minw) << ")." << endl;
		cerr << "(`" << argv[0] << " --help' for help.)" << endl;
		exit(-1);
	}

	// negated sites are only supported by the discriminative algorithm
	if (parameter->G > 0 && parameter->generative) { 
		cerr << "Cannot learn negated binding sites in generative mode (set -G=0)" << endl;
		exit(-1);
	}


	// Echo parameters
	// NOTE(review):  "Inupt" in the banner below is a typo for "Input"; it is
	//	runtime output, so it is left untouched here.
	cerr << endl << PROGRAM_VERSION << endl;
	cerr << "Inupt file(s):  ";
	for (vector<string>::const_iterator i=args.begin(); i!=args.end(); i++) { cerr << "\"" << *i << "\" "; }
	cerr << endl;
	cerr << "Parameter settings:  ";
	parameter->dump(cerr, "=", " ", true);	// true <=> terse dump, only use option flags
	cerr << endl;
	cerr << endl;

	// Seed PRNG
	//	(a negative seed option means "seed from the clock"; the chosen seed is
	//	printed so that the run can be repeated)
	if (parameter->seed < 0) { 	
		uint seed = time(NULL);
		cerr << "Seeding PRNG with " << seed << endl;	// always know your seed
		srand(seed);
	} else {
		srand((uint)(parameter->seed));
	}

	// READ and ORGANIZE Sequence data
	//	NOTE:  `read_data' will read, use, discard the background model 
	vector<IOExample*> ADP;	// allocated data (I will need to free this)
	vector<vector<Example*> > data(NUM_SETS);
	read_data(args, data, ADP);
	
	// Set the maximum distance to the length of the longest sequence
	//	(over all three data sets)
	CRM::MAXL = 0;
	for (uint t=0; t<NUM_SETS; t++) { 
	for (uint s=0; s<data[t].size(); s++) { 
		if (data[t][s]->sequence()->length() >  CRM::MAXL) { 
			CRM::MAXL = data[t][s]->sequence()->length();
		}
	}}
	// with uniform distances, the bin width is set equal to the maximum distance
	if (parameter->uniform_distance) {
		if (CRM::BINW != CRM::MAXL && parameter->verbose >= V_IM) { 
			cerr << "Resetting distance histogram bin width = " << CRM::MAXL << endl;
		}
		CRM::BINW = CRM::MAXL;	// no need to make this slow with smoothing
	}

	Timer timer;
	CRM solution;

	// Read "base" CRM Model
	CRM base;	// default:  empty CRM
	if (parameter->crm_base_filename != "") { 
		Timer base_timer;
		if (parameter->verbose >= V_MAX) { cerr << "Reading base model file \"" << parameter->crm_base_filename << "\"...  " << endl; }
		ifstream base_in(parameter->crm_base_filename.c_str());
		if (!base_in) { cerr << "ERROR:  Cannot open base CRM model file \"" << parameter->crm_base_filename << "\"." << endl;  exit(-1); }
		bool read_success = base.read(base_in);
		if (!read_success) { cerr << "Error reading CRM model file \"" << parameter->crm_base_filename << "\"." << endl;  exit(-1); }
		base.normalize();	// 2006-07-12, user-defined CRM may not be normalized
		// base.pseudocount(parameter->pseudocount);	// could do this to avoid zeros in the model
		base.normalize();	
		if (parameter->verbose >= V_IM) { cerr << "Read base CRM model from \"" << parameter->crm_base_filename << "\" (" << base_timer << "):" << endl; }
		if (parameter->verbose >= V_MAX) { cerr << endl << "Base model:" << endl << base << endl << endl; }
	}


	// print data statistics
	if (parameter->verbose >= V_IM) { datastats(cerr, data); }	



	if (parameter->crm_load_filename == "") { 

		// LEARN A REPRESENTATION -- GO FOR IT!
		//	(candidates are scored on the tune set, or on the training set
		//	when the tune set is empty)

		solution = learn(base, data[TRAIN], data[TUNE].size() ? data[TUNE] : data[TRAIN]);
		cerr << "Learned CRM (" << timer << "):";

	} else {

		// Read from file
		ifstream crm_in(parameter->crm_load_filename.c_str());
		if (!crm_in){ 
			cerr << "ERROR:  Could not open CRM input file, \"" << parameter->crm_load_filename << "\"" << endl;
			exit(-1);
		}
		bool read_success = solution.read(crm_in);
		crm_in.close();
		if (!read_success) { 
			cerr << "Error reading CRM from \"" << parameter->crm_load_filename << "\".  "
				 << "Check file version (expected version is " << CRM::file_version() << ").  "
				 << "Exiting." << endl;
			exit(-1);
		}
		solution.normalize();	// 2006-07-12, user-defined CRM may not be normalized
		cerr << "Read CRM from \"" << parameter->crm_load_filename << "\" (" << timer << "):";
	}

	// PRINT MODEL (to screen)
	if (solution.size()) { 
		cerr << endl << solution << endl;
	} else {
		cerr << "  (Learned solution is an empty CRM.)" << endl;
	}

	// CRM FILE
	//	(failure to open the output file is reported but is not fatal)
	if (parameter->crm_save_filename != "") { 
		ofstream crm_out(parameter->crm_save_filename.c_str());
		if (!crm_out) { cerr << "Can't open CRM file \"" << parameter->crm_save_filename << "\" for writing." << endl; }
		else { 
			solution.write(crm_out);
			cerr << "Wrote CRM to \"" << parameter->crm_save_filename << "\"." << endl;
			crm_out.close();
		}
	}
	
	
	// DISTANCE FILE
	//	(failure to open the output file is reported but is not fatal)
	if (parameter->distance_file != "") { 
		ofstream dfout(parameter->distance_file.c_str());
		if (!dfout) { cerr << "Can't open distance file \"" << parameter->distance_file << "\" for writing." << endl; }
		else { 
			solution.dump_distance(dfout);
			dfout.close();
			if (parameter->verbose >= V_IM) {
				cerr << "Wrote distances to \"" 
				     << parameter->distance_file << "\"." 
					 << endl; 
			}
		}
	}


	// EVALUATE MODEL ON DATA
	for (uint t=0; t<NUM_SETS; t++) {
	
		string set = t==TRAIN ? "Trainset" :
		             t==TUNE  ? "Tuneset" :
		                        "Testset";

		cerr << endl << " --- " << set << " --- " << endl << endl;	

		if (data[t].size()) {

			vector<CRMLocation> locations;

			vector<pair<probability,probability> > class_predictions = get_predictions(solution, data[t], &locations);

			// output
			//	one tab-separated line per sequence on stdout:
			//	name, set, weight, predicted probability, location
			
			for (uint s=0; s<data[t].size(); s++) { 
				cout << data[t][s]->name() << "\t"
					 << (t==TRAIN?"train":(t==TUNE?"tune":(t==TEST?"test":"?"))) << "\t"
					 << data[t][s]->weight() << "\t"
					 << class_predictions[s].second << "\t"
					 << locations[s] << "\t"
					 << endl;
			}

			// summary statistics, classification grid and metric to stderr 	
			cerr << endl;
			print_classification_grid(class_predictions, cerr);
		 	cerr << endl << set << " " << metric() << " = " << metric(class_predictions) << endl << endl;
			
		} else {
			cerr << "(empty.)" << endl;
		}

	}

	// Clean up
	parameter = NULL;	// `parser' is about to go out of scope
	for (uint x=0; x<ADP.size(); x++) { delete (ADP[x]); }	// free all data
	
	return 0;
}
	

	
	
