// Human-readable program name and version banner.
const char PROGRAM_VERSION[] = "Noto and Craven, CRM 1.0";
// Dashed rule; presumably printed beneath PROGRAM_VERSION as an underline (usage not shown here).
const char PROGRAM________[] = "------------------------";

/*

	2004/12/16

	2005/01/07

	2005/04/15	Changed distance constraint method:
					for each pair of branches, add the tree with a new BEST change to the current 
					distance constraint (do not add multiple trees at one stage).  This is found
					by trying the MIDPOINT between all relevant distances

	2005/04/16	Adding TUNE SET methods 

	2005/04/20	Changed Chi squared significance parameters slightly, now require a table value as user input parameter

	2005/04/20	Added Chi squared parameter search 

	2005/11/18	Removed multiple binding sites from aspect search				
						
*/

#include <vector>
#include <list>
#include <map>
#include <iostream>
#include <fstream>
#include <stdlib.h>
#include <string>
#include <sstream>
#include <math.h>
#include <assert.h>
using namespace std;

// definitions shared by this and the files below
#include "definitions.h"

// main working data structures
#include "Branch.h"
#include "Tree.h"
#include "Results.h"
#include "Hypothesis.h"

#include "Option.h"
#include "fasta.h"
#include "WSS.h"

#if DEBUG_MALLOC_CHECK
/**
 * Debug aid (compiled only when DEBUG_MALLOC_CHECK is set):  write, for each
 * tracked type, its allocation counter, its free counter, and their difference.
 */
void malloc_check_stats(ostream &out) {

	out << "DEBUG_MALLOC_CHECK:" << endl;

	out << "Hypothesis:  " << hypothesis_alloc << "+ " << hypothesis_free << "~ ";
	out << "\tbalance:  " << (hypothesis_alloc - hypothesis_free) << endl;

	out << "Tree:  " << tree_alloc << "+ " << tree_free << "~ ";
	out << "\tbalance:  " << (tree_alloc - tree_free) << endl;

	out << "Branch:  " << branch_alloc << "+ " << branch_free << "~ ";
	out << "\tbalance:  " << (branch_alloc - branch_free) << endl;

	out << "Results:  " << results_alloc << "+ " << results_free << "~ ";
	out << "\tbalance:  " << (results_alloc - results_free) << endl;
}
#endif



/** Print a branch as "(m1 m2 ...)" immediately followed by STRAND_STR[strand]. */
ostream& operator<<(ostream &out, const Branch &b) {

	out << "(";

	bool first = true;	// no separator before the first motif ID
	list<int>::const_iterator motif = b.motifs.begin();
	while (motif != b.motifs.end()) {
		if (!first) { out << " "; }
		out << *motif;
		first = false;
		++motif;
	}

	out << ")";
	out << STRAND_STR[b.strand];
	return out;
}


		
ostream& operator<<(ostream &out, const Tree &tree) {
	
	out << "[";
	for (vector<Branch*>::const_iterator b=tree.branches.begin(); b!=tree.branches.end(); b++) {
		if (b != tree.branches.begin()) { out << ","; }
		out << **b;
	}
	out << "]";

	if (!tree.negup.empty()) { 
		out << " neg. up = {";
		for (list<int>::const_iterator nu=tree.negup.begin(); nu!=tree.negup.end(); nu++) { 
			if (nu != tree.negup.begin()) { out << ", "; }
			out << *nu; 
		}
		out << "} ";
	}

	if (tree.tss != INFINITY) { out << " tss = " << tree.tss << " "; }
	
	if (!tree.negdown.empty()) { 
		out << " neg. down = {";
		for (list<int>::const_iterator nu=tree.negdown.begin(); nu!=tree.negdown.end(); nu++) { 
			if (nu != tree.negdown.begin()) { out << ", "; }
			out << *nu; 
		}
		out << "} ";
	}


	for (int b=0; b<tree.size(); b++) { 
	for (int b2=b+1; b2<tree.size(); b2++) { 
		if (tree.constraints[b][b2].oc == UPSTREAM_OF)   { out << "(" << b << " < " << b2 << ") "; }
		if (tree.constraints[b][b2].oc == DOWNSTREAM_OF) { out << "(" << b2 << " < " << b << ") "; }
		//if (tree.constraints[b][b2].oc == EITHER_ORDER)  { out << "(" << b << " <> " << b2 << ") "; }
			// need this?  either order is 'default'
	}}

	for (int b=0; b<tree.size(); b++) { 
	for (int b2=b+1; b2<tree.size(); b2++) { 
		if (tree.constraints[b][b2].dc != INFINITY)
			{ out << "(" << b << "," << b2 << " within " << tree.constraints[b][b2].dc << ") "; }
	}}
	
	for (int b=0; b<tree.size(); b++) { 
	for (int b2=b+1; b2<tree.size(); b2++) { 
		if (!tree.constraints[b][b2].negated.empty()) { 
			out << "(" << b << "," << b2 << " negated: {";
			for (list<int>::const_iterator n = tree.constraints[b][b2].negated.begin();
			                               n!= tree.constraints[b][b2].negated.end(); n++) { 
				if (n != tree.constraints[b][b2].negated.begin()) { out << ", "; }
				out << *n;
			}
			out << "} ";
		}
	}}
	
	return out;
}


/** Print the confusion counts, then precision (P), recall (R) and score (S), space separated. */
ostream& operator<<(ostream &out, const Results &results) {

	// confusion counts
	out << "TP=" << results.TP << " ";
	out << "FP=" << results.FP << " ";
	out << "TN=" << results.TN << " ";
	out << "FN=" << results.FN << " ";

	// derived measures
	out << "P=" << results.precision << " ";
	out << "R=" << results.recall << " ";
	out << "S=" << results.score;

	return out;
}



/** Print a hypothesis as "<tree>, <results>". */
ostream& operator<<(ostream &out, const Hypothesis &h) {
	out << *(h.tree);
	out << ", ";
	out << *(h.results);
	return out;
}



/** Print a motif instance as "sid=..., mid=..., pos=..., strand=...". */
ostream& operator<<(ostream &out, const MotifInstance &MI) {
	out << "sid=" << MI.sid << ", ";
	out << "mid=" << MI.mid << ", ";
	out << "pos=" << MI.pos << ", ";
	out << "strand=" << MI.strand;
	return out;
}



/**
 * Index a range of motif instances by sequence and by motif:
 *	SMMap[sequence][motif] -> list of (position, strand) pairs
 *
 *	begin,end:	range of objects exposing sid, mid, pos and strand
 *	S:			number of sequences (size of the outer dimension)
 *	M:			number of motifs (size of each inner dimension)
 *
 * Every instance's sid and mid are used directly as indexes, so they are
 * expected to lie in [0,S) and [0,M) -- no range check is made here.
 */
template <typename _MotifInstanceIterator>
SMMap create_smmap(_MotifInstanceIterator begin, _MotifInstanceIterator end, int S, int M) {

	// one (initially empty) slot per (sequence, motif) combination
	SMMap table(S);
	for (int sid = 0; sid < S; ++sid) {
		table[sid].resize(M);
	}

	// file each instance under its own sequence and motif
	_MotifInstanceIterator inst = begin;
	while (inst != end) {
		table[inst->sid][inst->mid].push_back(pair<int,int>(inst->pos, inst->strand));
		++inst;
	}

	return table;
}



/** See Results.h struct Results */


/** See Branch.h struct Branch */


/** See Tree.h struct Constraint, struct Tree, ... */


/** See Hypothesis.h struct Hypothesis */



/** 
 * Queue structure keeps a queue of POINTERS to objects, and OWNS them.
 *	Sorted order is maintained (via the add() function, which takes a comparor)
 *	The Queue is limited to MAX elements (additional elements will be DELETED)
 *	All elements in the queue will be DELETED when the Queue is deleted
 *	the pop() function will return the first element, which must be deleted by the caller.
 *
 *	peek() and pop() return NULL when the queue is empty.
 *
 *	NOTE:  no copy constructor / assignment operator is defined, so a copied
 *	Queue would share (and eventually double-delete) the same elements.
 *	Pass Queues by reference or pointer only.
 */
template <typename T>
struct Queue {

  private:
	int MAX;		// capacity
	int sz;			// current number of elements
	list<T*> Q;		// the elements, in the order established by add()

  public:

	Queue(int max) : MAX(max), sz(0) {  }

	~Queue() {
		// delete all remaining elements
		for (typename list<T*>::iterator i=Q.begin(); i!=Q.end(); i++) { delete (*i); }
		Q.clear();
	}

	/**
	 * Take ownership of t and insert it in sorted position.
	 *	(*compare)(a, b) must return true iff a belongs BEFORE b.
	 *	t is DELETED (not inserted) if an equal element (operator==) is met
	 *	before an insertion point is found, or if the queue is full and t
	 *	belongs after every current element.  Do not use t after this call.
	 */
	template <typename _Comparor>
	void add(T *t, const _Comparor *compare) {

		for (typename list<T*>::iterator i=Q.begin(); i!=Q.end(); i++) {
			if (*t == **i) { 
				// duplicate:  keep the one already in the queue
				delete t;
				return;
			} else if ((*compare)(t, *i)) {
				Q.insert(i, t);
				sz++;
				// inserting may have pushed the queue over capacity:  drop from the back
				while (sz > MAX) { 
					T *back = Q.back();
					Q.pop_back();
					delete back;
					sz--;
				}
				return;
			}
		}

		// if a spot was found, element would have been inserted
		//	and function would have returned already.
		if (sz < MAX) {
			Q.push_back(t); sz++; 
		} else {
			delete t;
		}
		return;
	}

	/** First element (still owned by the queue), or NULL if the queue is empty. */
	const T* peek() const {
		if (Q.empty()) { return NULL; }
		return Q.front();
	}

	inline typename list<T*>::const_iterator begin() const { return Q.begin(); }
	inline typename list<T*>::const_iterator end() const { return Q.end(); }

	/** Remove and return the first element (caller must delete it), or NULL if the queue is empty. */
	T* pop() { 
		if (Q.empty()) { return NULL; }
		T *ans = Q.front();
		Q.pop_front();
		sz--;
		return ans;
	}

	inline int max() const { return MAX; }
	inline int size() const { return sz; }
	inline bool empty() const { return Q.empty(); }
	
};


/**
 * Ordering used for the beam:  true iff LHS is strictly better (higher score) than RHS.
 *	A NaN score is never better than anything; any non-NaN score is better than NaN.
 */
struct HypScoreCompare { 
  public:
	bool operator()(const Hypothesis *LHS, const Hypothesis *RHS) const { 
		const double lhs_score = LHS->results->score;
		const double rhs_score = RHS->results->score;

		// NaN on the left loses, even against another NaN
		if (isnan(lhs_score)) { return false; }

		// otherwise NaN on the right loses; failing that, compare the scores
		return isnan(rhs_score) || (lhs_score > rhs_score);
	}
};

/**
 * Hash set of trees, used to remember which trees have already been seen.
 *	insert() stores its own COPY of a tree; all copies are deleted with the table.
 */
struct TreeHash {
  public:
	// create a table with `size` (initially empty) buckets
	TreeHash(int size) : table(size) { }
	~TreeHash() { delete_all(); }

	// add a copy of *tree; returns false (and adds nothing) if an equal tree is present
	bool insert(const Tree *tree);

  private:
	vector<list<Tree*> > table;		// buckets of owned Tree copies
	int hash(const Tree *tree);		// bucket index for a tree
	void delete_all();				// delete every stored tree
};




/**
 * Bucket index for a tree:  add up the tree's motif IDs, strands, order constraints,
 * finite distance constraints and negated motif IDs, multiply by (first motif ID of
 * branch 0, plus one), and reduce modulo the table size.
 *
 * Reads branch 0 unconditionally and divides by the table size, so it needs a tree
 * with at least one branch and a table with at least one bucket.
 */
int TreeHash::hash(const Tree *tree) {

	unsigned long sum = 0;
	const int B = tree->size();

	for (int i = 0; i < B; ++i) {

		// motif IDs of this branch
		list<int>::const_iterator motif = tree->motif_begin(i);
		while (motif != tree->motif_end(i)) {
			sum += *motif;
			++motif;
		}

		// constraints between this branch and each later one
		for (int j = i + 1; j < B; ++j) {

			sum += tree->order_constraint(i,j);

			if (tree->distance_constraint(i,j) < INFINITY) {
				sum += ( ((int)(tree->distance_constraint(i,j))) + 1);
			}

			list<int>::const_iterator neg = tree->negated_begin(i,j);
			while (neg != tree->negated_end(i,j)) {
				sum += *neg;
				++neg;
			}
		}

		sum += (tree->strand(i) + 1);
	}

	// tree-wide distance constraint
	if (tree->distance_constraint() < INFINITY) {
		sum += ((int)(tree->distance_constraint()));
	}

	// tree-wide negated motifs, for negated_begin(true) and then negated_begin(false)
	for (list<int>::const_iterator neg = tree->negated_begin(true); neg != tree->negated_end(true); ++neg) {
		sum += *neg;
	}
	for (list<int>::const_iterator neg = tree->negated_begin(false); neg != tree->negated_end(false); ++neg) {
		sum += *neg;
	}

	// mult. by first motif ID (+1 b/c starts at 0)
	const int first_motif = *(tree->motif_begin(0));
	const unsigned long product = sum * (first_motif + 1);

	const int bucket = (int)(product % this->table.size());
	return abs(bucket);	// abs for any overflow
}
	
bool TreeHash::insert(const Tree *tree) {

	int index = hash(tree);

	list<Tree*>::iterator i;
	for (i = table[index].begin(); i != table[index].end(); i++) { 
		if ( **i == *tree ) {
			return false; 
		}
	}

	Tree *copy = new Tree(*tree);
	table[index].insert(i, copy);

	//DEBUGGING
	//int count = 0;
	//for (int i=0; i<table.size(); i++) { count += table[i].size(); }
	//cerr << "TreeHash::insert:  (" << count << " trees) inserted at hash index " << index << ", tree " << *tree << endl;
		
	return true;
}

void TreeHash::delete_all() {

	for (int index=0; index<table.size(); index++) { 	
		for (list<Tree*>::iterator i=table[index].begin(); i!=table[index].end(); i++) { 
			delete (*i);	// delete tree pointer in list
		}
	}
}



	



/** Format any value that can be streamed with operator<< and return the text as a string. */
template <typename _PRINTABLE>
string tostring(_PRINTABLE p) {
	ostringstream buffer;
	buffer << p;	// same formatting the value would get on any ostream
	return buffer.str();
}

/**
 * Replace the FIRST occurrence of `out` in `src` with `in`, in place.
 *
 *	src:	string to modify
 *	out:	text to look for (an empty string matches at position 0)
 *	in:		text to put in its place
 *
 *	return:	true iff `out` was found (and therefore replaced)
 */
bool replace(string &src, const string &out, const string &in) {

	// keep the position as size_type and compare against npos:  squeezing it
	// into an int relies on npos narrowing to a negative value and breaks on
	// offsets that do not fit in an int.
	const string::size_type index = src.find(out);
	if (index == string::npos) { return false; }

	src.replace(index, out.length(), in);
	return true;
}
	
/**
 * Compare two PAIRS of things by the item that appears FIRST, largest first:
 * returns true iff LHS.first > RHS.first.  The second item is ignored.
 * Suitable as the ordering for std::sort to get DESCENDING order by `first`.
 */
template <typename _Pair>
struct ComparePairByFirstDesc { 
  public:
	// const:  the comparison has no state, and this lets it be called through
	// const objects and const references.
	bool operator()(const _Pair &LHS, const _Pair &RHS) const {
		return (LHS.first > RHS.first);
	}
};

/** 
 *	Find the optimal distance to use
 *
 *	Given:  a list of (distance, sequenceID) pairs, where ``distance'' is a distance such
 *			that, if a constraint were any LOWER than it, the sequence would no longer qualify.
 *			these are assumed to have been ALREADY SORTED BY DISTANCE, DESCENDING FROM THE LARGEST.
 *
 *			``dr'' and ``dd'':  a ``default'' set of results and a distance (the unconstrained 
 *			model on which this is based)
 *
 *			``P'', the sequence ID such than any sequence ID strictly LOWER than it are considered 
 *			positive examples.
 *
 *	Method:  walk the list; each entry passed is one more sequence that stops qualifying, so it
 *			moves from TP to FN (positive) or from FP to TN (negative).  Wherever two consecutive 
 *			distances differ, their MIDPOINT is a candidate; the candidate is kept if its score is 
 *			at least as good as the best so far, or if the best so far still scores the same as 
 *			the default.
 *
 *	return:  the optimal distance to use (best new distance), as the function's return value.
 *			``dd'' is returned unchanged if no candidate exists (fewer than two entries, or all 
 *			distances equal).
 *
 */
template <typename _Iterator>
double optimize_distance(_Iterator begin, _Iterator end, const Results &dr, const double dd, int P) {

	Results *current_results = new Results(dr);
	Results *best_results = new Results(dr);
	double best_distance = dd;

	// i1, i2 walk the list as a pair of CONSECUTIVE entries
	_Iterator i1 = begin;
	_Iterator i2 = begin;
	if (i2 != end) { ++i2; }	// on an empty list there is nothing to step over

	while (i1 != end && i2 != end) {

		int p1 = i1->first; int s1 = i1->second;
		int p2 = i2->first;

		// sequence s1 no longer qualifies once the constraint drops below p1
		int POS = (s1 < P) ? 1 : 0;
		int NEG = (s1 < P) ? 0 : 1;
		Results *new_current_results = new Results(current_results->TP-POS, current_results->FP-NEG, 
		                                           current_results->TN+NEG, current_results->FN+POS);
		delete current_results;
		current_results = new_current_results;

		// a cut can only be placed between two DIFFERENT distances
		if (p1 != p2) { 
			double midpoint = (p1 + ((p2-p1)/2.0));

			if (   current_results->score >= best_results->score
				|| best_results->score == dr.score) { 	// HERE:  insist that optimal results be different, i.e. the model has to change somehow
				delete best_results;
				best_results = new Results(*current_results);
				best_distance = midpoint;
			}
		}
		i2++;
		i1++;
	}

	delete current_results;
	delete best_results;
	return best_distance;
	
}


/** 
 *	Calculate a CHI SQUARED value for two results:  SUM(categories) (o-e)^2 / e
 *	over the four categories TP, FP, TN, FN, where the baseline supplies the
 *	expected counts (e) and the alternative supplies the observed counts (o).
 *
 *	Categories where both counts are zero contribute nothing.  A category with
 *	zero expected but nonzero observed sets the running value to INFINITY.
 *
 *	Difference is significant if value > chiinv(probability, 3 degrees of freedom)
 */
double chi_squared(const Results *baseline, const Results *alternative) { 

	const int N = 4;	// number of categories

	double expected[N] = { baseline->TP, baseline->FP, baseline->TN, baseline->FN };
	double observed[N] = { alternative->TP, alternative->FP, alternative->TN, alternative->FN };

	double total = 0.0;

	for (int category = 0; category < N; ++category) { 

		const double e = expected[category];
		const double o = observed[category];

		if (e == 0.0) {
			// 0/0 = 0 for our purposes here (i.e. it doesn't contribute to the X^2 sum);
			// nonzero/zero is considered significant
			if (!(o == 0.0)) { total = INFINITY; }
			continue;
		}

		// normal case
		const double diff = o - e;
		total += (diff * diff) / e;
	}

	return total;

}
		

/** handy subroutine:  Given a tree and its results (both handed over to this function), 
 *	1) if COLLECT_EXPLORED_TREES:  drop the tree if it was already recorded in `burned`
 *	2) wrap tree + results in a Hypothesis
 *	3) if the results differ significantly from `comp` (chi squared > chiinv), add the 
 *	   hypothesis to the queue (which will sort by results, delete if it can't fit inside)
 *	4) if not, delete the hypothesis here.
 *
 *	The caller must not use `tree` or `tree_results` afterwards.
 *	NOTE(review):  deleting the Hypothesis is presumed to free its tree and results too --
 *	confirm in Hypothesis.h.  S, P and smmap are not used.
 */
void enqueue(TreeHash *burned, Queue<Hypothesis> &queue, 
             Tree *tree, Results *tree_results, const Results *comp, const HypScoreCompare *compare,
             double chiinv, int S, int P, const SMMap &smmap) {

	#if COLLECT_EXPLORED_TREES
	const bool first_visit = burned->insert(tree);
	if (!first_visit) {
		// seen before:  discard both pieces
		delete tree;
		delete tree_results;
		return;
	}
	#endif

	Hypothesis *candidate = new Hypothesis(tree, tree_results);

	// perform quick Chi squared test
	const bool significant = (chi_squared(comp, tree_results) > chiinv);

	if (!significant) {
		// not different enough.  Don't clutter beam with this
		delete candidate;
		return;
	}

	// difference deemed statistically significant
	queue.add(candidate, compare);

}

/**
 * Parameters to main algorithm (the beam search in train()).
 * Plain data:  no constructor, so every field must be set by whoever creates one.
 */
struct Parameters {

	int K;		// beam width:  capacity of each search queue
	double R;	// min recall a hypothesis needs to be accepted as a solution

	int C;		// num (max) conjuncts, i.e. branches per tree
	int D;		// num (max) disjuncts, i.e. motifs per branch
	int G;		// num (max) negated motifs per region
	bool oc;	// order constraints?     (search adds them only if true)
	bool dc;	// distance constraints?  (search adds them only if true)
	bool sc;	// strand constraints?    (search adds them only if true)

	double chiinv;	// significance difference during search (A chisquared value comparing results, 3 DOF):
					//	a new hypothesis is queued only if its chi squared vs. its parent exceeds this

};
/** Print the search parameters as "C=.., D=.., G=.., oc=.., dc=.., sc=.., K=.., R=..".  chiinv is not printed. */
ostream& operator<<(ostream &out, const Parameters &P) { 

	// size limits
	out << "C=" << P.C << ", ";
	out << "D=" << P.D << ", ";
	out << "G=" << P.G << ", ";

	// constraint switches
	const char *order_flag    = (P.oc ? "ON" : "OFF");
	const char *distance_flag = (P.dc ? "ON" : "OFF");
	const char *strand_flag   = (P.sc ? "ON" : "OFF");
	out << "oc=" << order_flag << ", ";
	out << "dc=" << distance_flag << ", ";
	out << "sc=" << strand_flag << ", ";

	// beam width and minimum recall
	out << "K=" << P.K << ", ";
	out << "R=" << P.R;

	return out;
}



/** Train a model based on the given list of MotifInstance objects, using a beam search.
 *
 *	begin,end:	range of MotifInstance objects
 *	M:	number of motifs (assumed to be indexed 0...M-1)
 *
 * SetInfo:
 *	S:  number of sequences (assumed to be indexed 0...S-1)
 *	P:	number of positive sequences (assumed to be indexed 0...P-1)
 *
 * Parameters:
 *	K:	beam width
 *	R:	minimum Recall required of a solution
 *	C:	MAX branches per tree
 *	D:	MAX motifs per branch
 *	G:	MAX negated motifs per region
 *	oc, dc, sc:	whether order / distance / strand constraints may be added
 *	chiinv:	3 DOF Chi Squared test significance value; a changed tree is only queued
 *			if its results differ from its parent's by more than this
 *
 *	sw:	sequence weights, one per sequence (size must equal S)
 *	verbose:	report progress on cerr
 *
 * Search:
 *	Phase 0 grows structure (distance, order and strand constraints, new motifs, new branches).
 *	Phase 1 adds negated motifs, starting from the hypotheses kept from phase 0.
 *	In each phase the best hypothesis is popped, all of its variations are pushed back onto
 *	the SAME queue, and the popped hypothesis itself is saved for the next phase.
 *	The best-scoring hypothesis seen with recall >= R is the solution; the search stops
 *	early if one has precision and recall both equal to 1.
 *
 *	return:	a newly allocated Tree which the caller must delete.  If nothing reached the
 *			required recall, a warning is printed and an empty Tree is returned.
 */
template <typename _MotifInstanceIterator>
Tree* train(_MotifInstanceIterator begin, _MotifInstanceIterator end, 
			int M, const SetInfo &setinfo, const Parameters &parameters,
			const vector<double> sw, 	// 2006/01/03:  sequence weights
			bool verbose=false) {

	const int S = setinfo.S;
	const int P = setinfo.P;

	assert(S==sw.size());	// TODO:  this is a debugging check

	// create SEQUENCE,MOTIF->INSTANCE map
	SMMap smmap = create_smmap(begin, end, S, M);

	// create hypothesis comparor to use with solutions-in-progress (contents of beam)
	HypScoreCompare compare;

	// create a list of valid motifs
	list<int> motifs;
	for (int m=0; m<M; m++) { motifs.push_back(m);  }

	// Create Solution "queue" (which only contains one element)
	Queue<Hypothesis> solution(1);

	// Create list of already-examined trees
	#if COLLECT_EXPLORED_TREES
	TreeHash treehash(TREEHASH_TABLE_SIZE);
	#else
	TreeHash treehash(0);	// create table of no size which will not be used anyway
	#endif
	TreeHash *burned = &treehash;
	
	// Create initial queue:  one single-branch, single-motif tree per motif,
	//	each compared against an all-zero set of results
	Queue<Hypothesis> queue1(parameters.K);
	Results *default_results = new Results(0, 0, 0, 0); // 2005-12-20
	for (list<int>::const_iterator m=motifs.begin(); m!=motifs.end(); m++) { 
		Tree *tree = new Tree();
		tree->add_conjunct(*m, S, smmap);
		Results *results = tree->evaluate(S, P, smmap, sw, 0);
		enqueue(burned, queue1, tree, results, default_results, &compare, parameters.chiinv, S, P, smmap);
	}
	delete default_results;	

	Queue<Hypothesis> queue2(parameters.K); // leave empty

	// q1 is the queue being searched; q2 collects the hypotheses to start the next level from
	Queue<Hypothesis> *q1 = &queue1;
	Queue<Hypothesis> *q2 = &queue2;

	unsigned long solutions_examined = 0;

	// for each 'level', i'll compute the best solutions each level allows
	// different 'moves', e.g. negated regions may be computed only after a
	// previous set of good solutions is worked out.
	
	const int LEVELS = 2;	// perform operations on levels 0 and 1 only
	
	// LEVEL 0:  add branches, motifs, strand and order constraints, distance constraints
	// LEVEL 1:  add negated regions
	for (int level=0; level<LEVELS; level++) {


		if (verbose) { 
			if (level == 0) { cerr << "Phase " << level << ":  Search for structure..." << endl; }
			if (level == 1) { cerr << "Phase " << level << ":  Add negated motifs..." << endl; }
		}


		// while queue is not empty
		while (!q1->empty()) {

			solutions_examined++;


			// pop best off top.
			Hypothesis *current_hypothesis = q1->pop();
			Tree *current_tree = current_hypothesis->tree;
			Results *current_results = current_hypothesis->results;

			
			// CHANGE TO HYPOTHESIS:  Add distance constraints
			//	04/15/2005 NEW:  take midpoints of all RELEVANT (used) distances as posible distances
			//	NOTE:  We're still only adding ONE (highest scoring) tree with a new global and
			//			ONE tree per pairwise combination (not one per possible distance constraint)

			if (level == 0 && parameters.dc) {
		
				vector<vector<vector<pair<int, int> > > > pairwise;		// all used interbranch distances
				vector<pair<int, int> > global;							// all used ``global'', i.e. CRM<->TSS distances
				current_tree->get_distances(S, smmap, pairwise, global);	// ask tree to calculate them
				// dist to TSS
				std::sort(global.begin(), global.end(), ComparePairByFirstDesc<pair<int,int> >());

				double distance = optimize_distance(global.begin(), global.end(),
				                                     *current_results, current_tree->distance_constraint(), 
													 setinfo.P);

				Tree *alt = new Tree(*current_tree);
				alt->constrain_distance(distance);
				Results *results = alt->evaluate(S,P,smmap,sw,1); // 1 <=> constraining model
				enqueue(burned, *q1, alt, results, current_results, &compare, parameters.chiinv, S, P, smmap);
				
				for (int b1=0; b1<current_tree->size(); b1++) {
					for (int b2=b1+1; b2<current_tree->size(); b2++) {
		
						std::sort(pairwise[b1][b2].begin(), pairwise[b1][b2].end(), ComparePairByFirstDesc<pair<int,int> >());

						double distance = optimize_distance(pairwise[b1][b2].begin(), pairwise[b1][b2].end(),
															 *current_results, current_tree->distance_constraint(b1,b2), 
															 setinfo.P);
						Tree *alt = new Tree(*current_tree);
						alt->constrain_distance(b1,b2,distance);
						Results *results = alt->evaluate(S,P,smmap,sw,1); // 1 <=> constraining model
						enqueue(burned, *q1, alt, results, current_results, &compare, parameters.chiinv, S, P, smmap);
				
					}
				}
				
			}

			// CHANGE TO HYPOTHESIS:  Add order constraints
			if (level == 0 && parameters.oc) {
				for (int b1=0; b1<current_tree->size(); b1++) { 
					
					for (int b2=b1+1; b2<current_tree->size(); b2++) { 
						
						if (current_tree->order_constraint(b1, b2) == EITHER_ORDER) {
							Tree *alt;
							Results *results;

							// try both orders
							alt = new Tree(*current_tree);
							alt->constrain_order(b1, b2);
							results = alt->evaluate(S,P,smmap,sw,1); // 1 <=> constraining model
							enqueue(burned, *q1, alt, results, current_results, &compare, parameters.chiinv, S, P, smmap);
							
							alt = new Tree(*current_tree);
							alt->constrain_order(b2, b1);
							results = alt->evaluate(S,P,smmap,sw,1); // 1 <=> constraining model
							enqueue(burned, *q1, alt, results, current_results, &compare, parameters.chiinv, S, P, smmap);
						}
					}
				}
			}


			// CHANGE TO HYPOTHESIS:  Add a strand constraint
			if (level == 0 && parameters.sc) {
				for (int bid=0; bid<current_tree->size(); bid++) { 
					if (current_tree->strand(bid) == EITHER_STRAND) { 
						
						Tree *alt;
						Results *results;

						// try both strands
						alt = new Tree(*current_tree);
						alt->constrain_strand(bid, TEMPLATE_STRAND);
						results = alt->evaluate(S,P,smmap,sw,1); // 1 <=> constraining model
						enqueue(burned, *q1, alt, results, current_results, &compare, parameters.chiinv, S, P, smmap);
						
						alt = new Tree(*current_tree);
						alt->constrain_strand(bid, TRANSCRIBED_STRAND);
						results = alt->evaluate(S,P,smmap,sw,1); // 1 <=> constraining model
						enqueue(burned, *q1, alt, results, current_results, &compare, parameters.chiinv, S, P, smmap);
					}
				}
			}



			// CHANGE TO HYPOTHESIS:  Add a new motif
			if (level == 0) {
				for (int bid=0; bid<current_tree->size(); bid++) { 
					if (current_tree->size(bid) < parameters.D) { 
						for (list<int>::const_iterator m=motifs.begin(); m!=motifs.end(); m++) { 
							Tree *alt = new Tree(*current_tree);
							if (alt->add_disjunct(bid, *m, S, smmap)) {
								Results *results = alt->evaluate(S,P,smmap,sw,2); // 2 <=> relaxing model
								enqueue(burned, *q1, alt, results, current_results, &compare, parameters.chiinv, S, P, smmap);
							} else {
								// add_disjunct false b/c motif already there.
								delete alt;
							}
						}
					}
				}
			}

			// CHANGE TO HYPOTHESIS:  Add a new single-motif branch
			if (level == 0) {
				int current_B = current_tree->size();
				if (current_B < parameters.C) {
					for (list<int>::const_iterator m=motifs.begin(); m!=motifs.end(); m++) { 
						Tree *alt = new Tree(*current_tree);
						alt->add_conjunct(*m, S, smmap);	
						Results *results = alt->evaluate(S,P,smmap,sw,1); // 1 <=> constraining model
						enqueue(burned, *q1, alt, results, current_results, &compare, parameters.chiinv, S, P, smmap);
					}
				}
			}

		// --------------- LEVEL 1 -----------------------------

			// CHANGE TO HYPOTHESIS:  Add negated regions 
			if (level == 1) {

				for (list<int>::const_iterator m=motifs.begin(); m!=motifs.end(); m++) { 	
				
					// negate upstream
					if (current_tree->num_negated_regions(true) < parameters.G) {
						Tree *alt = new Tree(*current_tree);
						if (alt->negate(true, *m)) { 
							Results *results = alt->evaluate(S,P,smmap,sw,1); // 1 <=> constraining model
							enqueue(burned, *q1, alt, results, current_results, &compare, parameters.chiinv, S, P, smmap);
						} else {
							delete alt;
						}
					}
					
					// negate downstream
					if (current_tree->num_negated_regions(false) < parameters.G) { 
						Tree *alt = new Tree(*current_tree);
						if (alt->negate(false, *m)) { 
							Results *results = alt->evaluate(S,P,smmap,sw,1); // 1 <=> constraining model
							enqueue(burned, *q1, alt, results, current_results, &compare, parameters.chiinv, S, P, smmap);
						} else {
							delete alt;
						}
					}
	
					// negate between pairs
					for (int i=0; i<current_tree->size(); i++) {
					for (int j=i+1; j<current_tree->size(); j++) {

						if (current_tree->num_negated_regions(i, j) < parameters.G) { 
						
							Tree *alt = new Tree(*current_tree);
							if (alt->negate(i, j, *m)) {
								Results *results = alt->evaluate(S,P,smmap,sw,1);
								enqueue(burned, *q1, alt, results, current_results, &compare, parameters.chiinv, S, P, smmap);
							} else {
								delete alt;
							}
						}
					}}

				}
			}
					
		// --------------- LEVEL 2 -----------------------------
	
			// POSSIBLE LEVEL 2  (never reached while LEVELS == 2)
			// TODO:  relax other constraints (and add option for this)

			// CHANGE TO HYPOTHESIS:  Remove branches
			if (level == 2) {
				if (current_tree->size() > 1) { 
					for (int b=0; b<current_tree->size(); b++) { 
						Tree *alt = new Tree(*current_tree);
						alt->remove(b);
						Results *results = alt->evaluate(S,P,smmap,sw,2); // 2 <=> relaxing model
						enqueue(burned, *q1, alt, results, current_results, &compare, parameters.chiinv, S, P, smmap);
					}
				}
			}
		
			// CHANGE TO HYPOTHESIS:  Remove motifs 
			if (level == 2) {
				for (int b=0; b<current_tree->size(); b++) { 
					if (current_tree->size(b) > 1) { 
						for (int m_index=0; m_index<current_tree->size(b); m_index++) { 
							Tree *alt = new Tree(*current_tree);
							alt->remove(b, m_index, S, smmap);
							Results *results = alt->evaluate(S,P,smmap,sw,1); // 1 <=> constraining model
							enqueue(burned, *q1, alt, results, current_results, &compare, parameters.chiinv, S, P, smmap);
						}
					}
				}
			}


		// ----------- END OF LEVELS ----------------------------

			// add copy to solution "queue"
			if (current_results->recall >= parameters.R) {	// all final solutions considered, recall >= R
			if (solution.empty() || current_results->score  >  solution.peek()->results->score) {
		
				if (verbose) { cerr << "new solution:  " << *current_tree << "\n\t" << *current_results << endl; }
				
				solution.add(new Hypothesis(new Tree(*current_tree), 
				             new Results(*current_results)), 
							 &compare);	

				// if solution is P=1 R=1, guess we should stop looking
				if (solution.peek()->results->recall==1.0 && solution.peek()->results->precision==1.0) {
					delete current_hypothesis;	// but, since we just added a COPY to the solution queue, 
												//	we still need to delete the current copy of it.
					break; 
				}
			}}
	
			// add current to queue for next level 
			//	04/11/2005 moved from above:  must Queue.add() might delete its addend, 
			//	so must add copy before adding (and possibly deleting) original 
			//	(rarely causes a seg fault)
			q2->add(current_hypothesis, &compare);

		} // while !q1->empty()


		// if solution is P=1 R=1, guess we should stop looking
		//	(the solution queue is still EMPTY if nothing has reached recall >= R yet,
		//	 so it must be checked before peeking at it)
		if (!solution.empty()
			&& solution.peek()->results->recall==1.0 && solution.peek()->results->precision==1.0) { break; }


		// get ready for next level

		// Consider limiting motif list between levels?
			/*
			vector<bool> mincl(M);
			std::fill(mincl.begin(), mincl.end(), false);
			for (list<Hypothesis*>::const_iterator i=q2->begin(); i!=q2->end(); i++) { 
				for (int b=0; b<(*i)->tree->size(); b++) { 
					for (list<int>::const_iterator m=(*i)->tree->motif_begin(b); m!=(*i)->tree->motif_end(b); m++) { 
						mincl[*m] = true; } } }
			motifs.clear();
			for (int m=0; m<M; m++) { if (mincl[m]) { motifs.push_back(m); } } */

			

		// swap queues
		Queue<Hypothesis> *tmp = q1;
		q1 = q2;
		q2 = tmp;

	} // next level

	if (verbose) { cerr << solutions_examined << " solutions examined." << endl; }

	// what's my final answer?
	Hypothesis *hypothesis;
	if (solution.empty()) {
		cerr << "WARNING:  No solution found, recall >= " << parameters.R << "." << endl;
		hypothesis = new Hypothesis(new Tree(), new Results(P, S-P, 0, 0));	 // in future, Results() may need 1 more param (4/8/5)
	} else {
		hypothesis = solution.pop();	
	}

	// print the final solution
	if (verbose) { cerr << "Final solution:  " << *hypothesis << endl; }

	// hand back a copy of the tree only; the hypothesis is no longer needed
	Tree *ans = new Tree(*(hypothesis->tree));
	delete hypothesis;

	return ans;


} // TRAIN



/** TEST
	Evaluate a tree on a data set and return the results by value.

	tree = current hypothesis (it is reset against these data, then evaluated)
	begin,end = iterators through MotifInstance objects { sid, mid, pos, strand }
	SetInfo = struct { int S, int P }; (that's it)
	M = number of motifs
	sw = sequence weights (parallel to sequences; size must equal setinfo.S)
	squal = if a pointer is given, copy qualifying sequences to this vector
	passlist = optional; handed through to Tree::evaluate unchanged

*/
template <typename _MotifInstanceIterator>
Results test(Tree *tree, _MotifInstanceIterator begin, _MotifInstanceIterator end, 
			 const SetInfo &setinfo, const int M,
			 const vector<double> &sw,
			 vector<bool> *squal = NULL,
			 vector<list<vector<int> > > *passlist = NULL) {

	const int S = setinfo.S;

	// create SEQUENCE,MOTIF->INSTANCE map
	SMMap smmap = create_smmap(begin, end, S, M);

	assert(S==sw.size());	// TODO:  this is a debugging check

	// point the tree at these data, then score it
	tree->reset(S, smmap);
	Results *evaluated = tree->evaluate(S, setinfo.P, smmap, sw, 0, passlist);

	// call to Tree::evaluate guarantees that Tree::squal is up-to-date with these data
	if (squal != NULL) {
		tree->copy_squal(squal);
	}

	// return a value, not the heap object evaluate() gave us
	Results ans(*evaluated);
	delete evaluated;

	return ans;

} // test


/**
 * Based on fold f of N-fold C.V., remove the appropriate sequences
 *	from "trainset" and add them to "testset"
 *	IMPORTANT:  sequence IDs in trainset and testset will be renumbered
 *	so they are contiguous from 0 and trainP testP will be set to the
 *	number of positive instances in each set.
 */
void cfv_divide(int N, int f, const SetInfo &setinfo,
				list<MotifInstance> &trainset, SetInfo &train_setinfo,
				list<MotifInstance> &testset, SetInfo &test_setinfo) {

	// re-map sequence IDs
	vector<int> setmap(setinfo.S);	// old-seq-ID -> new-seq-ID
	train_setinfo.S = test_setinfo.S = 0;
	for (int s=0; s<setinfo.S; s++) {

		if (s==setinfo.P) { train_setinfo.P = train_setinfo.S;  test_setinfo.P = test_setinfo.S; }
	
		if (s % N == f) { 
			setmap[s] = test_setinfo.S++;
		} else {
			setmap[s] = train_setinfo.S++;
		}
	}

	list<MotifInstance>::iterator i = trainset.begin();
	while (i != trainset.end()) { 


		if (i->sid % N == f) { 
			// testset
			i->sid = setmap[i->sid];
			testset.push_back(*i);	// copy
			i = trainset.erase(i);
		} else {
			i->sid = setmap[i->sid];
			i++;
		}
	}
  
	return;
}



/** run N-fold cross-validation, and report results on a test/tune set
 *	assume vector length for data and setinfo == number of folds
 *
 *	For each fold, the fold's data set is split with cfv_divide, the sequence
 *	weights are split the same way (sequence s is held out iff s % N == fold)
 *	and each part is rescaled so that its weights average 1.  A tree is trained
 *	on the training part and tested on the held-out part.
 *
 *	return:	Results built from the TP, FP, TN, FN counts summed over all folds
 */
Results cross_fold_sum(const vector<MotifInstanceSet*> &set,
                       const Parameters &parameters,
                       const vector<double> &sw) {

	const int N = set.size();

	double TP=0, FP=0, TN=0, FN=0;

	MotifInstanceSet trainset, testset;	// reused (overwritten) for every fold
	
	for (int fold=0; fold<N; fold++) {

		const MotifInstanceSet *source = set[fold];

		// start from a full copy, then move this fold's sequences over to testset
		trainset.data = source->data;	// copy, overwrite old
		testset.data.clear();
		trainset.M = testset.M = source->M;
		cfv_divide(N, fold, source->info, trainset.data, trainset.info, testset.data, testset.info);

		// stratify / divide sequence weights
		vector<double> train_sw;
		vector<double> test_sw;
		double train_sum=0, test_sum=0;
		for (int s=0; s<sw.size(); s++) { 
			const bool held_out = (s % N == fold);
			vector<double> &part = held_out ? test_sw : train_sw;
			double &part_sum = held_out ? test_sum : train_sum;
			part.push_back(sw[s]);
			part_sum += sw[s];
		}

		// rescale each part by its own mean weight
		if (!train_sw.empty()) {
			const double train_mean = train_sum/train_sw.size();
			for (int s=0; s<train_sw.size(); s++) { train_sw[s] /= train_mean; }
		}
		if (!test_sw.empty()) {
			const double test_mean = test_sum/test_sw.size();
			for (int s=0; s<test_sw.size(); s++) { test_sw[s] /= test_mean; }
		}

		// verbose==false, true only for debugging
		Tree *H = train(trainset.data.begin(), trainset.data.end(), trainset.M, trainset.info, parameters, train_sw, false);

		Results fold_results = test(H, testset.data.begin(), testset.data.end(), testset.info, testset.M, test_sw);

		TP += fold_results.TP;
		FP += fold_results.FP;
		TN += fold_results.TN;
		FN += fold_results.FN;

		delete H;	// train() hands us ownership of the tree
	}

	return Results(TP,FP,TN,FN); // in future, may need 1 more param (4/8/5)
	
}


/** 
 * IMMEDIATELY_FIX_GAIN_CLIMB:
 *
 *	when searching for the best search parameters, I have two methods:
 *
 *	1 (slow):  for each possible change, try them all and make the best choice
 *	2 (fast):  any possible change that's an improvement, make it, fix it, and keep going (IMMEDIATELY_FIX_GAIN_CLIMB)
 */
#define IMMEDIATELY_FIX_GAIN_CLIMB 0


/**
 * Subroutine:  train/test an alternative (relaxed) set of parameters by cross-validation.
 *
 *	The current (more expressive) set is KEPT if
 *		- its score is higher than the relaxed score AND the two results differ by 
 *		  more than chiinv (chi squared), or
 *		- the relaxed score is nan and the current score is not.
 *	Otherwise the model is RELAXED:
 *		- with IMMEDIATELY_FIX_GAIN_CLIMB, the relaxed results/parameters replace the 
 *		  current ones right away;
 *		- without it, they replace the best ones if this is the first improvement found, 
 *		  or they score higher than the best so far, or the best score is nan and theirs is not.
 *	In both relaxed cases found_improvement is set when something was replaced.
 *
 *	current_results / best_results are owned pointers:  when replaced, the old object 
 *	is deleted and a new one allocated.
 */
void tune_compare(bool &found_improvement, const Parameters &alt_parameters, double chiinv,
					Results* &current_results, Parameters &current_parameters,
					Results* &best_results, Parameters &best_parameters,
					const vector<MotifInstanceSet*> &set,
					const vector<double> &sw, bool verbose) {


	Results r = cross_fold_sum(set, alt_parameters, sw);

	const bool relaxed_nan = isnan(r.score);
	const bool current_nan = isnan(current_results->score);

	bool keep_ex;
	if (relaxed_nan) {
		// special cases:  keep expressive model if its score is !nan and relaxed score is nan;
		//	if BOTH are nan, best to RELAX
		keep_ex = !current_nan;
	} else {
		// keep expressive model if:
		//	1) the score is better 
		//	2) it is significantly different
		keep_ex =    (current_results->score > r.score)
		          && (chi_squared(&r, current_results) > chiinv);
	}

	if (keep_ex) { 
		// best to KEEP more expressive model
		if (verbose) { cerr << "keep more expressive model (" << r << ")\n"; }
		return;
	}

	// best to RELAX the model
	if (verbose) { cerr << "relax model"; }
		
	#if IMMEDIATELY_FIX_GAIN_CLIMB

	found_improvement = true;
	delete current_results;
	current_results = new Results(r);
	current_parameters = alt_parameters;

	#else

	const bool is_best =    !found_improvement 
	                     || r.score > best_results->score 
	                     || (!relaxed_nan && isnan(best_results->score));
	if (is_best) {
		if (verbose) { cerr << " (best, " << r << ")\n"; }
		found_improvement = true;
		delete best_results;	// deleting a null pointer is harmless
		best_results = new Results(r);
		best_parameters = alt_parameters;	// copy
	} else {
		if (verbose) { cerr << " (but not best, " << r << ")\n"; }
	}

	#endif

	return;
}


/**
 * Learn the CRM "aspect space":  starting from init_parameters, repeatedly try
 * less expressive settings and move to one whenever tune_compare() accepts it.
 *
 * Each pass tries, in this order:
 *   - fewer conjuncts      (C-1 down to 1; only when tuneMBS is true)
 *   - fewer disjuncts      (D-1 down to 1)
 *   - fewer negated motifs (G-1 down to 0)
 *   - strand constraints off, distance constraints off, order constraints off
 *     (each only if currently on)
 *
 * With IMMEDIATELY_FIX_GAIN_CLIMB == 0 the best accepted candidate of the pass
 * becomes the new current setting at the end of the pass; otherwise
 * tune_compare() has already updated the current setting in place.
 * The search stops after the first pass that accepts nothing.
 *
 * @param init_parameters  starting (most expressive) parameters
 * @param set              data sets handed to cross_fold_sum(), one per tuning fold
 * @param chiinv           chi-squared threshold forwarded to tune_compare()
 * @param tuneMBS          whether the number of conjuncts (C) may be reduced
 * @param sw               sequence weights forwarded to cross_fold_sum()
 * @param verbose          print progress to stderr
 * @return the parameters the search ended on
 */
Parameters tune(const Parameters &init_parameters,
				const vector<MotifInstanceSet*> &set,
				double chiinv, bool tuneMBS,
				const vector<double> &sw, 
				bool verbose=false) {

	const string PAD  = "  ";		// one indentation level of verbose output
	const string PAD2 = PAD + PAD;	// two levels

	if (verbose) { cerr << "TUNE MODEL:" << endl; }

	// TODO:  make this a different parameter?
	const int TUNE_BEAM_WIDTH = init_parameters.K;

	Parameters current_parameters = init_parameters;
	current_parameters.K = TUNE_BEAM_WIDTH;

	// baseline:  results obtained with the starting parameters
	Results baseline = cross_fold_sum(set, current_parameters, sw);
	Results *current_results = new Results(baseline);

	// Keep looking for a way to IMPROVE on the current results by
	// relaxing the expressivity of the model.
	int pass = 0;	// for verbose output only
	bool found_improvement = true;
	while (found_improvement) {

		if (verbose) {
			++pass;
			cerr << endl;
			cerr << PAD << "loop " << pass << endl;
			cerr << PAD << "current parameters:  " << current_parameters << endl;
			cerr << PAD << "current results:  " << *current_results << endl;
		}

		found_improvement = false;	// until some candidate is accepted

		Results *best_results = NULL;
		Parameters best_parameters;

		// 2005-11-21:  consider multiple binding sites part of  tunable aspect space?
		if (tuneMBS) {
			// C (Conjuncts)
			int conjuncts = current_parameters.C;
			while (--conjuncts >= 1) {
				if (verbose) { cerr << PAD2 << conjuncts << " conjuncts..."; }
				Parameters candidate = current_parameters;
				candidate.C = conjuncts;
				tune_compare(found_improvement, candidate, chiinv,
							 current_results, current_parameters,
							 best_results, best_parameters, set, sw, verbose);
			}
		}

		// D (Disjuncts)
		int disjuncts = current_parameters.D;
		while (--disjuncts >= 1) {
			if (verbose) { cerr << PAD2 << disjuncts << " disjuncts..."; }
			Parameters candidate = current_parameters;
			candidate.D = disjuncts;
			tune_compare(found_improvement, candidate, chiinv,
						 current_results, current_parameters,
						 best_results, best_parameters, set, sw, verbose);
		}

		// G (Negated Regions)
		int negated = current_parameters.G;
		while (--negated >= 0) {
			if (verbose) { cerr << PAD2 << negated << " negated regions..."; }
			Parameters candidate = current_parameters;
			candidate.G = negated;
			tune_compare(found_improvement, candidate, chiinv,
						 current_results, current_parameters,
						 best_results, best_parameters, set, sw, verbose);
		}

		// sc (Strand constraint)
		if (current_parameters.sc) {
			if (verbose) { cerr << PAD2 << "strand constraints off..."; }
			Parameters candidate = current_parameters;
			candidate.sc = false;
			tune_compare(found_improvement, candidate, chiinv,
						 current_results, current_parameters,
						 best_results, best_parameters, set, sw, verbose);
		}

		// dc (Distance constraints)
		if (current_parameters.dc) {
			if (verbose) { cerr << PAD2 << "distance constraints off..."; }
			Parameters candidate = current_parameters;
			candidate.dc = false;
			tune_compare(found_improvement, candidate, chiinv,
						 current_results, current_parameters,
						 best_results, best_parameters, set, sw, verbose);
		}

		// oc (Order constraints)
		if (current_parameters.oc) {
			if (verbose) { cerr << PAD2 << "order constraints off..."; }
			Parameters candidate = current_parameters;
			candidate.oc = false;
			tune_compare(found_improvement, candidate, chiinv,
						 current_results, current_parameters,
						 best_results, best_parameters, set, sw, verbose);
		}

		// All candidates of this pass have been examined.

		#if IMMEDIATELY_FIX_GAIN_CLIMB
			// nothing to do:  tune_compare() already committed any change
		#else
			if (found_improvement) {
				// adopt the best candidate; ownership of best_results moves
				// to current_results, so it must not be deleted here
				delete current_results;
				current_results = best_results;
				current_parameters = best_parameters;
			}
		#endif

	} // while found_improvement

	delete current_results;
	return current_parameters;
}


/**
 * Read a motif-instance (.mif) stream into `mil`.
 *
 * Each record starts with four integers:  sequenceID  motifID  position  strand.
 * Anything after them up to the end of the line is skipped.
 * (NOTE:  for the version of MIF as of 2004-02-14, the rest of the line should be
 *  log-match/background ratio, match-sequence, tmpl-strand-match-sequence.)
 *
 * @param mif_in  input stream positioned at the first record
 * @param mil     cleared, then filled with one MotifInstance per record read
 * @param hi_mid  raised to the largest motif ID seen (never lowered)
 * @param hi_sid  raised to the largest sequence ID seen (never lowered)
 * @return always true (kept for compatibility with callers; failure to open a
 *         file is reported by the filename overload)
 */
bool read_mif_file(istream &mif_in, list<MotifInstance> &mil, int &hi_mid, int &hi_sid) {

	mil.clear();
	while (mif_in.good()) { 	
		WSS wss;
		string dummy;
		int sid,mid,pos,strand;

		// Stop as soon as a complete record cannot be read (end of input, empty
		// file, or malformed line).  Without this check the uninitialised ints
		// would be stored as a bogus instance and could corrupt hi_mid/hi_sid.
		mif_in >> sid >> mid >> pos >> strand;
		if (mif_in.fail()) { break; }
		mif_in >> wss;

		if (mid > hi_mid) { hi_mid = mid; }
		if (sid > hi_sid) { hi_sid = sid; }
		mil.push_back(MotifInstance(sid,mid,pos,strand));

		// skip rest of line; also give up if the stream ends or fails first,
		// so a truncated last line cannot cause an endless loop
		while (!wss.eol && mif_in.good()) { mif_in >> dummy >> wss; }
	}
	return true;
}

/**
 * Open `filename` and read it as a motif-instance (.mif) file via the
 * istream overload of read_mif_file().
 *
 * @return false if the file cannot be opened; otherwise whatever the
 *         istream overload returns.
 */
bool read_mif_file(string filename, list<MotifInstance> &mil, int &hi_mid, int &hi_sid) { 

	ifstream mif_in(filename.c_str());
	if (!mif_in) { return false; }

	// No explicit close():  the ifstream destructor closes the file on return.
	// (The original close() call sat after both returns and was unreachable.)
	return read_mif_file(mif_in, mil, hi_mid, hi_sid);
}

/****
string passlist_ranges(const list<vector<int> > &instances) {

	if (instances.begin() == instances.end()) { return "empty set"; }

	string ans = "";
	int max_range = -1;	// range of 0 okay for single-motif CRMs
	int min_range = (int)(uint(-1)/2);	// max signed integer

	for (list<vector<int> >::const_iterator it=instances.begin(); it!=instances.end(); it++) {

		// it -> [ a , b , c ], positions of the motif that satisfies each branch
		assert(it->size());

		int min, max;
		min = max = *(it->begin());
		
		for (vector<int>::const_iterator L=it->begin(); L!=it->end(); L++) { 
			min = std::min(min, *L);
			max = std::max(max, *L);
		}

		int range = abs(max - min);

		//if (range < min_range)  
		if (range > max_range) { 
			ostringstream oss;
			oss << "[" << min; if (min < max) { oss << "..." << max; }  oss << "]";
			ans = oss.str();
		} else if (range == max_range) {
			ostringstream oss;
			oss << " or [" << min; if (min < max) { oss << "..." << max; }  oss << "]";
			ans = ans + oss.str();
		}
		min_range = std::min(min_range, range);
		max_range = std::max(max_range, range);
	}

	return ans;

}
******/

/**
 * Summarise a pass list as the overall range of positions it covers.
 *
 * Every element of `instances` is a vector of positions ([a, b, c]:  positions
 * of the motif that satisfies each branch).  The result is "[min...max]" taken
 * over ALL positions of ALL vectors, or "empty set" if there are none.
 *
 * The bounds are seeded from the first position seen rather than from sentinel
 * values, so the result is correct for every int (including INT_MIN) and the
 * function does not depend on the non-standard `uint` typedef.
 */
string passlist_ranges(const list<vector<int> > &instances) {

	if (instances.empty()) { return "empty set"; }

	bool seen = false;	// has any position been read yet?
	int lowest = 0;
	int highest = 0;

	for (list<vector<int> >::const_iterator it=instances.begin(); it!=instances.end(); ++it) { 

		// it -> [ a , b , c ], positions of the motif that satisfies each branch
		assert(it->size());

		for (vector<int>::const_iterator L=it->begin(); L!=it->end(); ++L) { 
			if (!seen) {
				lowest = highest = *L;
				seen = true;
			} else {
				if (*L < lowest)  { lowest = *L; }
				if (*L > highest) { highest = *L; }
			}
		}
	}

	// only reachable when asserts are disabled and every vector was empty
	if (!seen) { return "empty set"; }

	ostringstream oss;
	oss << "[" << lowest << "..." << highest << "]";
	return oss.str();
}




/**
 * Print per-sequence predictions for either the train or the test part of a
 * cross-fold split.  One tab-separated line per selected sequence:
 *
 *     seqID  "train"|"test"  ACTUAL  PREDICTED  [count, range]
 *
 * A sequence s belongs to the test part when (s % N == f); lines are written
 * only for sequences whose membership matches `testset`.  ACTUAL is '+' for
 * s < setinfo.P and '-' otherwise.  PREDICTED comes from `squal`, which is
 * indexed by the running count of selected sequences (not by s).  When
 * `passlist` is given and the prediction is positive, the number of entries in
 * the matching pass list and its passlist_ranges() summary are appended.
 */
void print_results(ostream &out, bool testset, uint N, uint f, const SetInfo &setinfo, const vector<bool> &squal, 
					const vector<list<vector<int> > > *passlist = NULL) {

	const string SEP = "\t";	// record separator
	const string label = testset ? "test" : "train";

	uint selected = 0;	// index into squal / passlist

	for (uint seq=0; seq<setinfo.S; seq++) {

		// skip sequences that belong to the other part of the split
		const bool in_test_fold = (seq % N == f);
		if (in_test_fold != testset) { continue; }

		const bool predicted_positive = squal[selected];

		out << seq << SEP;
		out << label << SEP;
		out << (seq < setinfo.P ? '+' : '-') << SEP;
		out << (predicted_positive ? '+' : '-') << SEP;

		if (passlist && predicted_positive) {
			const list<vector<int> > &hits = (*passlist)[selected];
			out << hits.size() << ", " << (passlist_ranges(hits)) << SEP;
		}

		out << endl;
		selected++;
	}
}
	
/**
 * Program entry point.
 *
 * Steps, in order:
 *   1. declare and parse the command-line options;
 *   2. read the motif instance (.mif) file, the optional motif (FASTA) file and
 *      the optional sequence-weight file;
 *   3. if N/F describe a valid fold, split data and weights into train/test;
 *   4. either read a CRM model from file or (optionally tune the aspect space
 *      and then) train one;
 *   5. optionally write the model, print it to stderr, and print train/test
 *      predictions to stdout.
 *
 * Exits with -1 on any input error reported below; returns 0 otherwise.
 */
int main(int argc, char **argv) {

	// algo parameters
	Parameters parameters;

	// File I/O
	string crm_in_file, crm_out_file;

	// learning/tuning parameters
	bool learn_aspects;
	double tune_chiinv;
	bool tuneMBS;
	int tune_folds;
	string tune_mif_file_pattern;
	string DEFAULT_TUNE_MIF_FILE_PATTERN = "";
	string TUNE_MIF_FILE_FOLD_SYMBOL = "#";
	
	// CFV parameters
	int N,f;
	string mif_file, motif_file, sw_file;

	// data set parameters
	MotifInstanceSet trainset, testset;
	SetInfo setinfo;	// input data set information

	// verbose output (to stderr) throughout the program
	bool verbose;	// verbose output
	
	string comment;
	
	ostringstream oss;
	oss << PROGRAM_VERSION << ":  "
		<< "Learn a cis-regulatory module (CRM) from instances of motif occurences.";
	
	OptionParser parser(oss.str());
	parser.add("mif", 'f', &mif_file, "Motif instance file.  Each line as:  sequenceID  motifID  postition  strand(1=TMPL,2=TCX)  [comments...]");
	parser.add("motif", 'm', &motif_file, "", "Motifs file (FASTA format), mapping motif ID to consensus (for program output).  Zero origin.");
	parser.add("sw", 'W', &sw_file, "", "Sequence-weight file.  One value per sequence, in order, separated by white space");

	parser.add("write", 'w', &crm_out_file, "", "Write learned model to this file");
	parser.add("read", 'r', &crm_in_file, "", "Read CRM model from this file instead of training");

	parser.add("sequences", 'S', &setinfo.S, "Number of sequences (numbered 0 through S-1)");
	parser.add("positive", 'P', &setinfo.P, "Number of positive examples (numbered 0 through P-1).  The first P sequences are the positive sequences.  The rest (P through S-1) are negative examples.");
	
	parser.add("beam", 'K', &parameters.K, 10, "Beam Width");
	parser.add("recall", 'R', &parameters.R, 0.0, "Required recall (Do not return a solution with recall smaller than this value.");
	
	parser.add("maxc", 'C', &parameters.C, 3, "Maximum number of binding sites in CRM");
	parser.add("maxd", 'D', &parameters.D, 3, "Maximum number of motifs per binding site in CRM");
	parser.add("nooc", 'o', &parameters.oc, true, "Do not use order constraints");	// parameters.oc <=> DO USE order constraints, etc.
	parser.add("nodc", 'd', &parameters.dc, true, "Do not use distance constraints");
	parser.add("nosc", 's', &parameters.sc, true, "Do not use strand constraints");
	parser.add("maxn", 'G', &parameters.G, 1, "Maximum number of motifs per negated region in CRM");

	parser.add("chiinv", 'V', &parameters.chiinv, 0.0, "Chi squared table value for distinquishing models during search (3 degrees of freedom)");

	parser.add("folds", 'N', &N, 1, "Cross-fold validation folds (nonpositive <=> no cross-fold validation)");
	parser.add("fold", 'F', &f, (uint)-1, "Cross-fold validation fold (if data size mod folds = fold, example is test set)");

	parser.add("noasearch", 'a', &learn_aspects, true, "Do not learn CRM aspect space");
	parser.add("tfolds", 'T', &tune_folds, 10, "Tuning folds in cross-fold-validation during aspect learning");
	parser.add("noclimit", 'c', &tuneMBS, true, "Do not reduce maximum number of binding sites while selecting aspects");
	// NOTE:  the P=0.9 entry previously showed the P=0.99 value (0.1148...);
	//        corrected to agree with the table below.
	parser.add("tchiinv", 'x', &tune_chiinv, 2.3659727453222, "Chi squared table value for distinquishing models during aspect learning (comparing TP/FP/TN/FN, 3 DOF.  For P=0.01 <=> x=11.344882119471, P=0.05 <=> x=7.8147247029009, P=0.1 <=> x=6.2513944516967, P=0.25 <=> 4.108342113366, P=0.5 <=> 2.3659727453222, P=0.75 <=> 1.21253205702341, P=0.9 <=> 0.584375459244772).");
	parser.add("tmif", 't', &tune_mif_file_pattern, DEFAULT_TUNE_MIF_FILE_PATTERN, "File pattern for all TUNE set .mif files (program will replace '#' with tune fold ID starting at zero)");

		/* Possible default values:	probability	value (as calc. by Excel's CHIINV(prob,dof) function)
		                            ----------- ------------------
									1       0.0000465661287307739
									0.999   0.0242984860676643
									0.99    0.114831620490197
									0.9     0.584375459244772
									0.75    1.21253205702341
									0.5     2.3659727453222
									0.25    4.108342113366
									0.1     6.2513944516967
									0.05    7.8147247029009
									0.01    11.344882119471
									0.001   16.2659591723013
									0.0001  21.104018392748
		*/

	
	//parser.add("verbose", 'v', &verbose, true, "Print extra information (stderr):  Train/Tune progress, etc.");
	parser.add("quiet", 'q', &verbose, true, "Don't print extra information (to stderr).  This is trainset/tuneset progress, etc.");	// 2006/01/03:  Changed default to true

	parser.add("comment", 'X', &comment, "", "Comment (ignored)");
	
	vector<string> args = parser.parse(argc, argv);

	// should be no non-option arguments.
	if (args.size()) { 
		cerr << "Unexpected argument(s):  `" << args[0] << "'" << endl;
		exit(-1);
	}

	if (N<=0) { N=1; f=N; }	// make CFV var's legal values (N=0 is bad because I divide by it)
	const bool CFV = (N>0 && 0 <= f && f < N);

	// read mif file 
	int hi_mid = -1;
	int hi_sid = -1;
	// An unreadable .mif file is fatal (same policy as for the tune-set files
	// below); previously the failure was ignored and training ran on no data.
	if (!read_mif_file(mif_file, trainset.data, hi_mid, hi_sid)) {
		cerr << "Error:  Can't read .mif file \"" << mif_file << "\"" << endl;
		exit(-1);
	}
	if (hi_sid >= setinfo.S) { 
		cerr << "read sequence ID out-of-range (ID = " << hi_sid << " >= " << setinfo.S << " = S)" << endl;
		exit(-1);
	}


	// read motifs
	list<pair<string, string> > motifs;
	if (motif_file == "") { 

		// 2005/07/20, if no motif_file given, use "generic motifs"
		for (int m=0; m<=hi_mid; m++) { 
			ostringstream oss;
			oss << m;
			motifs.push_back(pair<string,string>("> motif ", oss.str()));
		}
		
	} else {
		
		ifstream motif_in(motif_file.c_str());
		if (!motif_in) { 
			cerr << "Cannot open motif file, \"" << motif_file << "\"." << endl;
			exit(-1);
		}
		while (motif_in.good()) {
			// read into a fresh slot; drop the slot again if the read failed
			motifs.push_back(pair<string,string>("",""));
			fasta_code fc = fasta_read(motif_in, motifs.back().first, motifs.back().second);
			if (fc != fasta_success) { motifs.pop_back(); }
		}
		motif_in.close();
	}
	const int M = motifs.size();
	trainset.M = testset.M = M;

	// check on motif file (vs. mif file)
	if (hi_mid + 1 > M) { 
		cerr << argv[0] << ":  Read more motif IDs in motif instance file "
		     << "(" << (hi_mid+1) << " in \"" << mif_file << "\") "
			 << "than in motif list " 
			 << "(" << M << " in \"" << motif_file << "\")." << endl; 
		exit(-1); 
	}

	// echo options:
	cerr << endl;
	cerr << PROGRAM________ << endl
		 << PROGRAM_VERSION << endl
		 << PROGRAM________ << endl
		 << "Motifs:  " << M << " (" << motif_file << ")" << endl
		 << "Motif Instances:  " << trainset.data.size() << " (" << mif_file << ")" << endl
		 << "Sequence weights:  " << (sw_file=="" ? "No file given (All 1's)" : sw_file.c_str()) << endl
		 << "Max. branches:  " << parameters.C << endl
		 << "Max. motifs:  " << parameters.D << endl
		 << "Max. negated motifs:  " << parameters.G << endl
		 << "Beam width:  " << parameters.K << endl
		 << "Required (minimum) recall:  " << parameters.R << endl
		 << "Use strand constraints?  " << (parameters.sc ? "YES" : "NO") << endl
		 << "Use order constraints?  " << (parameters.oc ? "YES" : "NO") << endl
		 << "Use distance constraints?  " << (parameters.dc ? "YES" : "NO") << endl
		 << "Chi squared value for evaluating model distintion during search:  " << parameters.chiinv << endl
		 << "Learn aspect space?  " << (learn_aspects ? "YES" : "NO") << endl
		 << "Tune CFV folds:  " << tune_folds << endl
		 << "Allow reduction of max. binding sites?  " << (tuneMBS ? "YES" : "NO") << endl
		 << "Chi Squared value for parameter choosing:  " << tune_chiinv << endl
		 << "N = " << N << ", F = " << f 
		 << ((CFV) ? " (One fold of cross-fold validation)" : " (Train on entire data set)") << endl
		 ;
	
	cerr << "--" << endl << "Options:  ";
	parser.dump(cerr, "=", ", ", true);	// dump(stream,name/value separator,delim,terse);
	cerr << endl << "--" << endl;

	// sequence weights:  read from file (then scaled so the average is 1),
	// or all 1's when no file is given
	vector<double> sw;
	if (sw_file != "") {
		ifstream fin(sw_file.c_str());
		if (!fin) { cerr << "Cannot open sequence weights file, \"" << sw_file << "\"." << endl;  exit(-1); }
		// Only store values that were actually parsed.  (The old loop stored an
		// uninitialised value whenever an extraction failed.)
		double w;
		while (fin >> w) { sw.push_back(w); }
		if (!fin.eof()) {
			// the loop stopped on something that is not a number
			cerr << "Error:  non-numeric entry in sequence weights file, \"" << sw_file << "\"." << endl;
			exit(-1);
		}
		fin.close();
		if (sw.size() != setinfo.S) { 
			cerr << "Error:  data set size (" << setinfo.S << ") not equal to number of sequence weights (" 
				 << sw.size() << ")." << endl;  exit(-1); 
		}
		double sum = 0;
		for (int s=0; s<setinfo.S; s++) { sum += sw[s]; }
		for (int s=0; s<setinfo.S; s++) { sw[s] /= (sum/setinfo.S); }	// average sw[i] = 1

	} else {
		sw.resize(setinfo.S);
		for (int s=0; s<setinfo.S; s++) { sw[s] = 1; }
	}



	vector<double> train_sw;
	vector<double> test_sw;

	if (CFV) { 
		// separate out testset from trainset
		cfv_divide(N, f, setinfo,
					trainset.data, trainset.info,
					testset.data, testset.info);

		// stratify-divide train/test sequence weights:  sequence s is a test
		// sequence exactly when s % N == f

		for (int s=0; s<setinfo.S; s++) { 
			if (s % N == f) { 
				// test 
				test_sw.push_back(sw[s]);
			} else {
				train_sw.push_back(sw[s]);
			}
		}
			
		double sum = 0;
		for (int s=0; s<train_sw.size(); s++) { sum += train_sw[s]; }
		for (int s=0; s<train_sw.size(); s++) { train_sw[s] /= (sum/train_sw.size()); }	// sum to size (avg. sw[i] = 1)
		
		sum = 0;
		for (int s=0; s<test_sw.size(); s++) { sum += test_sw[s]; }
		for (int s=0; s<test_sw.size(); s++) { test_sw[s] /= (sum/test_sw.size()); }
		
	} else {

		// no cross-fold validation:  everything is training data, test set is empty
		train_sw.resize(setinfo.S);	std::copy(sw.begin(), sw.end(), train_sw.begin());
		test_sw.clear();

		trainset.info = setinfo;	// copy

		testset.info.S = testset.info.P = 0;
		testset.data.clear();
	}


	Tree *H;	// Hypothesis

	if (crm_in_file != "") { 
		
		// read CRM instead of learning
		ifstream fin(crm_in_file.c_str());
		if (!fin) { cerr << "Cannot read CRM model input file " << crm_in_file << endl;  exit(-1); }
		H = new Tree(fin);

	} else {

		// learn parameters for final training
		if (learn_aspects) {

			// storage for per-fold tune sets (used only with a tune file pattern)
			vector<MotifInstanceSet> tune_sets(tune_folds);

			// what tune() actually receives:  one pointer per tuning fold
			vector<MotifInstanceSet*> set(tune_folds);
			for (int fold=0; fold<tune_folds; fold++) { set[fold] = NULL; }

			if (tune_mif_file_pattern != DEFAULT_TUNE_MIF_FILE_PATTERN) { 
			
				for (int fold=0; fold<tune_folds; fold++) { 
					string filename = tune_mif_file_pattern;
					bool good_pattern = replace(filename, TUNE_MIF_FILE_FOLD_SYMBOL, tostring(fold));
					if (!good_pattern) { 
						cerr << "Warning:  pattern \"" << tune_mif_file_pattern 
							 << "\" does not contain substitution symbol \"" 
							 << TUNE_MIF_FILE_FOLD_SYMBOL << "\"." << endl;
					}
					tune_sets[fold].data.clear();
					int hi_mid = -1, hi_sid = -1;	// per-file maxima (shadow the outer ones on purpose)
					bool good_read = read_mif_file(filename, tune_sets[fold].data, hi_mid, hi_sid);
					if (!good_read) {
						cerr << "Error:  Can't read tuneset .mif file \"" << filename << "\"" << endl;
						exit(-1);
					}

					// You just read in a FULL dataset (complete with TEST sequence(s)).
					// still need to remove them.
					if (CFV) { 
						MotifInstanceSet dummy_testset;
						cfv_divide(N, f, setinfo, tune_sets[fold].data, tune_sets[fold].info,
												  dummy_testset.data, dummy_testset.info);
						// okay, good to go
					}
					
					tune_sets[fold].info.P = trainset.info.P;
					tune_sets[fold].info.S = trainset.info.S;
					tune_sets[fold].M = hi_mid+1;
					set[fold] = &(tune_sets[fold]);

				}

			} else {

				// create a set of (the same) training data info
				for (int fold=0; fold<tune_folds; fold++) { set[fold] = &(trainset); }

			}

			parameters = tune(parameters, set, tune_chiinv, tuneMBS, train_sw, verbose);	// verbose==true <=> verbose output during tune phase
			
			cerr << "Learned CRM aspect space:" << endl
				 << " Max. binding sites = " << parameters.C << endl
				 << " Max. motifs / binding site = " << parameters.D << endl
				 << " Allow distance constraints = " << (parameters.dc ? "Yes" : "No") << endl
				 << " Allow order constraints = " << (parameters.oc ? "Yes" : "No")  << endl
				 << " Allow strand constraints = " << (parameters.sc ? "Yes" : "No")  << endl
				 << " Max. motifs / negated region = " << parameters.G << endl
				 ;
			
		}

		// train using parameters either given or learned above
		H = train(trainset.data.begin(), trainset.data.end(), trainset.M, trainset.info, parameters, train_sw, verbose);
	
	} // if reading from file / training new CRM model

	// write solution to file
	if (crm_out_file != "") { 
		ofstream fout(crm_out_file.c_str());
		if (!fout) {
			// not fatal:  the model is still printed and evaluated below
			cerr << "Could not open CRM model output file " << crm_out_file << " for writing." << endl; 
		} else {
			H->write(fout);
			fout.close();
		}
	}
	
	// print the solution
	vector<string> motif_identifiers;
	motif_identifiers.reserve(M);
	for (list<pair<string, string> >::const_iterator i=motifs.begin(); i!=motifs.end(); i++) {
		// i->first = FASTA header, e.g. "> header header header..."
		// i->second = actual sequence, e.g. "acacagtGATCCCCCGCGCatgggtttt"
		ostringstream oss;
		oss << i->second << ":  " << i->first;	
			// Example:
			// "aggtCATGTGGCaag:  > R00853 IgH (immunoglobulin heavy chain); Gene: G000537. sequence 1 of 1"
		motif_identifiers.push_back(oss.str());
	}
	cerr << endl;
	H->print(cerr, motif_identifiers);

	vector<bool> train_squal, test_squal;	// which sequences in each set qualify as positive predictions

	// print training set results (noto, 2006-03-31)
	{
		// cerr << ((trainset.info.S == 0) ? "(Trainset empty.)" : "Trainset results:") << endl;
		vector<list<vector<int> > > passlist(trainset.data.size());
		Results results = test(H, trainset.data.begin(), trainset.data.end(), trainset.info, trainset.M, train_sw, &train_squal, &passlist);
		cerr << "Trainset results (Seq. Set Class Prediction):  " << results << endl;
		print_results(cout, false, N, f, setinfo, train_squal, &passlist);
	}
	
	// if we're doing cross-fold-validation, test on testset and print results (noto, 2006-03-31)
	{
		vector<list<vector<int> > > passlist(testset.data.size());
		Results results = test(H, testset.data.begin(), testset.data.end(), testset.info, testset.M, test_sw, &test_squal, &passlist);
		cerr << "Testset results (Seq. Set Class Prediction):  " << results << endl;
		print_results(cout, true, N, f, setinfo, test_squal, &passlist);
	}

	delete H;

	#if DEBUG_MALLOC_CHECK
	malloc_check_stats(cerr);
	#endif

	return 0;

} // main


