/*

	Functionality for generating a gaussian mixture model from a set of data (double values)

	Uses the EM algorithm to converge to a good mixture
	
	Uses random restarts to find the best mixture
		first run positions gaussians uniformly in the range of the data
		subsequent runs position gaussians randomly (number of restarts given by user)
	
	Uses cross-validation to decide whether or not a certain number of Gaussians is better than another:
		first sets up a mixture of "MINIMUM NUMBER" gaussians (this is the best-so-far)
		then, for each possible number-of-gaussians
			if sum-probability-density is significantly better on held-aside data than the best-so-far
				reset a new best-so-far


	Usage:

		GMGenerator gmg(min-num-gaussians, max-num-gaussians, folds, em_iterations, random_restarts, 
					    confidence-threshold);
						
		GaussianMixture gm = gmg.generate(mydata.begin(), mydata.end());
		GaussianMixture gm2 = gmg.generate(mydata.begin(), mydata.end(), gaussian-constraint-functor);
		

*/

#ifndef GM_GENERATOR_H
#define GM_GENERATOR_H 0

#include "gaussian.h"
#include "TTest.h"

#include <functional>
using namespace std;

struct GM_Noop;			// "constrain" functor that doesn't do anything (default)

/** simple structure for various statistics for a set of reals */
struct real_data_statistics {
	double mean;	///< arithmetic mean of the values
	double stdev;	///< standard deviation of the values
	double min;		///< smallest value seen
	double max;		///< largest value seen
	int count;		///< how many values were summarised
};

/** Default "constrain" functor: takes a mixture and leaves it untouched.

	Defined BEFORE GMGenerator because the two-argument generate() below
	constructs a GM_Noop temporary; that expression does not depend on a
	template parameter, so the type has to be complete at that point.

	The argument_type / result_type typedefs are provided directly rather than
	through std::unary_function (deprecated in C++11, removed in C++17).
*/
struct GM_Noop {
	typedef GaussianMixture argument_type;
	typedef void result_type;
	void operator()(GaussianMixture & /*gm*/) const { }
};

/** Builds a GaussianMixture for a range of real values.

	The number of Gaussians is chosen between `min` and `max` by
	cross-validation; parameters are fitted with EM plus random restarts.
	All tuning knobs are public data members and may be changed between calls
	to generate().
*/
class GMGenerator {
  
  private:

	/** divide data set into train/test folds
		(element p goes to testsets[p % folds] and to every other trainsets[f]) */
	template <typename T, typename _T_Iterator>
	static void divide(_T_Iterator begin, _T_Iterator end, vector<vector<T> > &trainsets, vector<vector<T> > &testsets, int folds);

	/** get statistics (mean, stdev, min, max, count) about a set of doubles */
	template <typename _Ditty> // Double Iterator
	static real_data_statistics get_stats(_Ditty begin, _Ditty end);

	/** compare two mixtures on a set of data and keep the best one 
		(if compare is better, set keeper=compare)
		(if compare is better, set keeper_density = compare.log_density(...) )
	*/
	template <typename _Ditty> // Double Iterator
	static void compare(GaussianMixture &keeper, GaussianMixture &compare, _Ditty begin, _Ditty end);
	template <typename _Ditty> // Double Iterator
	static void compare(GaussianMixture &keeper, GaussianMixture &compare, double &keeper_density, _Ditty begin, _Ditty end);

	/** set up a mixture model depending on where the data are */	
	static void init_uniform(GaussianMixture &gm, const real_data_statistics &stats);
	static void init_random(GaussianMixture &gm, const real_data_statistics &stats);
	
	/** use cross-validation to cluster Gaussians in a mixture about a data set
		returns one held-out log-density per fold and appends the mixture
		trained on each fold to trained_mixtures
		calls:  random_restart_em 
	*/
	template <typename _Ditty, typename _GM_Constraints> // Double Iterator
	vector<double> cross_validation_cluster(GaussianMixture &gm, _Ditty begin, _Ditty end, 
	                                        const _GM_Constraints &constrain,
											vector<GaussianMixture> &trained_mixtures);
												   
	/** run several random restarts of the EM algorithm and keep the best on the training set 
		calls:  em
	*/
	template <typename _Ditty, typename _GM_Constraints>
	void random_restart_em(GaussianMixture &gm, double &gm_density,
	                       _Ditty begin, _Ditty end, 
	                       const real_data_statistics &stats, const _GM_Constraints &constrain);

	/** Run EM to set model parameters	
		calls: em_itr
	*/
	template <typename _Ditty, typename _GM_Constraints>
	void em(GaussianMixture &gm, _Ditty begin, _Ditty end, 
	        const real_data_statistics &stats, const _GM_Constraints &constrain);

	/** Run one iteration of the EM algorithm to set the mixture parameters for the given set of data points */
	template <typename _Ditty, typename _GM_Constraints> // _Ditty = Double Iterator
	static void em_itr(GaussianMixture &gm, _Ditty begin, _Ditty end, int data_count,
	                   vector<vector<double> > &h, const _GM_Constraints &constrain);
	


  public:

	int min;				// min number-of-Gaussians
	int max;				// max number-of-Gaussians
	int folds;				// number of cross-fold-validation folds
	int em_iterations;		// EM iterations per training run
	int random_restarts;	// training runs per fold (the first one is the
							//	uniform initialisation, the rest are random)
	double pvalue;			// confidence p-value to convince that one 
							//	number-of-gaussians is better than another
							//	NOTE:  not all p-values are available
							//	(see TTest.h)
	
	/** Store the tuning parameters; no validation is performed here. */
	GMGenerator(int min=1, int max=3, int folds = 10, int em_iterations = 20, 
	            int random_restarts = 6, double pvalue = 0.50)
		: min(min),
		  max(max),
		  folds(folds),
		  em_iterations(em_iterations),
		  random_restarts(random_restarts),
		  pvalue(pvalue) { }
  

	/** Generate a GaussianMixture based on given data
		_Ditty:  Iterator through double data
		_GM_Constraints:  functor that constrains mixture
		                  (invoked as constrain(gm) on a GaussianMixture&)
		calls:  cross_validation_cluster
	*/
	template <typename _Ditty, typename _GM_Constraints> 
  	GaussianMixture generate(_Ditty begin, _Ditty end, const _GM_Constraints &constrain);

	/** Same as above with no constraints (uses GM_Noop). */
	template <typename _Ditty> 
  	GaussianMixture generate(_Ditty begin, _Ditty end) { return generate(begin, end, GM_Noop()); }


}; // class 




/** divide data into several train/test folds */
template <typename T, typename _T_Iterator>
void GMGenerator::divide(_T_Iterator begin, _T_Iterator end, 
                         vector<vector<T> > &trainsets, vector<vector<T> > &testsets, int folds) {

	trainsets.resize(folds);
	testsets.resize(folds);

	_T_Iterator itty = begin;

	for (int p=0; itty != end;) { 

		for (int f=0; f<folds; f++) { 
			if ( p%folds != f ) {
				trainsets[f].push_back(*itty);
			} else {
				testsets[f].push_back(*itty);
			}
		}
		itty++;
		p++;
	}
	return;
}


/** Summarise a range of reals: min, max, mean, standard deviation and count.

	The standard deviation divides by the element count (not count - 1).
	An empty range yields all-zero statistics.
*/
template <typename _Ditty> // Double Iterator
real_data_statistics GMGenerator::get_stats(_Ditty begin, _Ditty end) { 

	real_data_statistics summary;

	// nothing to measure
	if (begin == end) {
		summary.count = 0;
		summary.min = summary.max = summary.mean = summary.stdev = 0.0;
		return summary;
	}

	// first pass: extremes, total and element count
	summary.min = summary.max = *begin;
	summary.count = 0;
	double total = 0.0;
	for (_Ditty it = begin; it != end; ++it) {
		if (*it < summary.min) { summary.min = *it; }
		if (*it > summary.max) { summary.max = *it; }
		total += *it;
		summary.count++;
	}
	summary.mean = total / summary.count;

	// second pass: spread around the mean
	double squared_deviation = 0.0;
	for (_Ditty it = begin; it != end; ++it) {
		squared_deviation += ( *it - summary.mean ) * ( *it - summary.mean );
	}
	summary.stdev = sqrt(squared_deviation / summary.count);

	return summary;
} // get_stats

		
/** compare two mixtures on a set of data and keep the best one 
	(if compare is better, set keeper=compare)

	Convenience overload: computes keeper's log-density over [begin, end)
	itself and then defers to the five-argument overload.
*/
template <typename _Ditty> // Double Iterator
void GMGenerator::compare(GaussianMixture &keeper, GaussianMixture &compare, 
                                 _Ditty begin, _Ditty end) { 
	double keeper_density = keeper.log_density(begin, end);
	// The parameter named `compare` hides the member function of the same
	// name, so the call must be qualified to reach the overload.
	GMGenerator::compare(keeper, compare, keeper_density, begin, end);
}

/** Keep whichever mixture scores the higher log-density on [begin, end).

	keeper_density must hold keeper's score on entry.  When `compare` scores
	strictly higher, keeper is overwritten with it and keeper_density is
	updated; otherwise nothing changes.
*/
template <typename _Ditty> // Double Iterator
void GMGenerator::compare(GaussianMixture &keeper, GaussianMixture &compare, 
                                 double &keeper_density,
                                 _Ditty begin, _Ditty end) { 

	const double challenger_density = compare.log_density(begin, end);

	// not strictly better: leave the keeper alone
	if (!(challenger_density > keeper_density)) { return; }

	keeper = compare;
	keeper_density = challenger_density;
}
		
	



/** Place the mixture's Gaussians at evenly spaced means across the data range.

	gm:     mixture to initialise (its current size decides how many Gaussians)
	stats:  statistics of the data; min, max and stdev are read

	Declared inline: this is a non-template function defined in a header, so
	without it every translation unit including the header would emit its own
	definition and the link would fail.
*/
inline void GMGenerator::init_uniform(GaussianMixture &gm, const real_data_statistics &stats) { 

	//  min       mu[0]      mu[1]      mu[2]       max
	//   |----------^----------^----------^----------|
	//   size() Gaussians split the range into size()+1 equal gaps

	double width = stats.max - stats.min;	// width of data range
	int spacers = gm.size() + 1;			// number of inter-Gaussian spaces
	double dist = width / spacers;			// distance between Gaussians

	for (int i=0; i<gm.size(); i++) { 
		
		gm[i].mu = stats.min + ((i+1) * dist);		// uniform means
		gm[i].sigma = stats.stdev / gm.size();		// each with a share of the variance
		gm[i].w = 1.0 / gm.size();					// uniform weights

	}
}


/** Place the mixture's Gaussians at random means inside the data range.

	gm:     mixture to initialise (its current size decides how many Gaussians)
	stats:  statistics of the data; min, max and stdev are read

	Each Gaussian gets mean = min + rnd() * (max - min), a random weight from
	rnd(), and sigma = stdev / size; the weights are then normalised.
	NOTE(review): rnd() is defined outside this file -- presumably uniform in
	[0, 1); confirm.

	Declared inline: this is a non-template function defined in a header, so
	without it every translation unit including the header would emit its own
	definition and the link would fail.
*/
inline void GMGenerator::init_random(GaussianMixture &gm, const real_data_statistics &stats) { 

	for (int i=0; i<gm.size(); i++) { 
		
		gm[i].mu = stats.min + rnd() * (stats.max-stats.min);
		gm[i].sigma = stats.stdev / gm.size();		// each with a share of the variance
		gm[i].w = rnd();

	}

	gm.normalize();	// weights should sum to 1.0

} // init_random
	

/** Train and score a mixture once per cross-validation fold.

	For every fold the mixture is fitted to that fold's training data (a
	uniform start followed by random restarts), scored on the fold's held-out
	data, and a copy is appended to trained_mixtures.

	Returns the held-out log-densities, one per fold, in fold order.
	gm is used as the working mixture and ends up holding the last fold's fit.
*/
template <typename _Ditty, typename _GM_Constraints> // Double Iterator
vector<double> GMGenerator::cross_validation_cluster(GaussianMixture &gm, _Ditty begin, _Ditty end, 
                                                     const _GM_Constraints &constrain, 
													 vector<GaussianMixture> &trained_mixtures) {

	// partition the data into per-fold train/test sets
	vector<vector<double> > trainsets, testsets;
	divide(begin, end, trainsets, testsets, folds);

	vector<double> held_out_scores;	// return value

	for (int fold = 0; fold < folds; ++fold) {

		vector<double> &training = trainsets[fold];
		vector<double> &held_out = testsets[fold];

		real_data_statistics stats = get_stats(training.begin(), training.end());

		// first candidate: Gaussian means spread uniformly over the data range
		init_uniform(gm, stats);
		em(gm, training.begin(), training.end(), stats, constrain);
		double gm_density = gm.log_density(training.begin(), training.end());

		// further candidates: random restarts, keeping the best on the training data
		random_restart_em(gm, gm_density, training.begin(), training.end(), stats, constrain);

		held_out_scores.push_back(gm.log_density(held_out.begin(), held_out.end()));
		trained_mixtures.push_back(gm);
	}

	return held_out_scores;

} // cross_validation_cluster

/** Try randomly initialised EM runs and keep the best mixture in gm.

	gm / gm_density:  the best mixture so far and its log-density on
	                  [begin, end); both are updated whenever a restart beats them
	Performs random_restarts - 1 restarts (the caller's own initial run
	counts as the first attempt).
*/
template <typename _Ditty, typename _GM_Constraints>
void GMGenerator::random_restart_em(GaussianMixture &gm, double &gm_density, 
                                    _Ditty begin, _Ditty end, 
                                    const real_data_statistics &stats, const _GM_Constraints &constrain) { 

	// scratch mixture with the same number of Gaussians as gm; re-initialised on every attempt
	GaussianMixture candidate = gm;

	for (int attempt = 1; attempt < random_restarts; ++attempt) {

		// fresh random start, then fit
		init_random(candidate, stats);
		em(candidate, begin, end, stats, constrain);

		// gm keeps whichever of (gm, candidate) has the higher log-density
		compare(gm, candidate, gm_density, begin, end);
	}

}


/** Fit the mixture's parameters to [begin, end) with em_iterations rounds of EM.

	stats must describe the same data range (stats.count sizes the per-point
	work table).  A single-Gaussian mixture is set straight from the
	statistics instead of being iterated.
*/
template <typename _Ditty, typename _GM_Constraints> // _Ditty = Double Iterator
void GMGenerator::em(GaussianMixture &gm, _Ditty begin, _Ditty end, 
                     const real_data_statistics &stats, const _GM_Constraints &constrain) {

	// one Gaussian: the data's own mean/stdev are the answer, no iteration needed
	if (gm.size() == 1) {
		gm[0].mu = stats.mean;
		gm[0].sigma = stats.stdev;
		gm[0].w = 1.0;
		constrain(gm);	// the constraint functor still gets its say
		return;
	}

	// membership table shared by all iterations: one row per data point,
	// one column per Gaussian
	vector<vector<double> > membership(stats.count, vector<double>(gm.size()));

	for (int round = 0; round < em_iterations; ++round) {
		em_itr(gm, begin, end, stats.count, membership, constrain);
	}
}


/** Run one iteration of the EM algorithm to set the mixture parameters for the given set of data points

	gm:          mixture being fitted (updated in place)
	begin, end:  the data points
	data_count:  number of data points in [begin, end)
	h:           work table, one row per data point and one column per Gaussian;
	             overwritten with each point's normalised membership
	constrain:   functor applied to gm at the end of the iteration
*/
template <typename _Ditty, typename _GM_Constraints> // _Ditty = Double Iterator
void GMGenerator::em_itr(GaussianMixture &gm, _Ditty begin, _Ditty end, int data_count, vector<vector<double> > &h, const _GM_Constraints &constrain) {

	// E-step:  Re-estimate expected cluster membership
	int i = 0;	// index to data in h
	for (_Ditty d=begin; d!=end; ++d, ++i) { // for i in set of data points

		// density of this point under each Gaussian
		// NOTE(review): assumes density() already accounts for the Gaussian's
		// weight -- confirm in gaussian.h
		double point_prob = 0.0;	// total over all Gaussians
		for (int j=0; j<gm.size(); j++) {
			h[i][j] = gm[j].density(*d);
			point_prob += h[i][j];
		}

		if (point_prob > 0.0) {
			// normalize over all gaussians
			for (int j=0; j<gm.size(); j++) {
				h[i][j] /= point_prob;
			}
		} else {
			// No Gaussian explains this point (every density underflowed to
			// zero, or the total is NaN).  Dividing would put NaN into every
			// membership and from there into every parameter, so the point is
			// given zero membership everywhere for this iteration instead.
			for (int j=0; j<gm.size(); j++) {
				h[i][j] = 0.0;
			}
		}
	}

	// M-step:  Re-estimate cluster parameters (means and variances)
	//	NOTE:  if any of the Gaussians go to zero weight (data_point_cnt = 0), this
	//	Gaussian has essentially been eliminated forever.  Set mean and variance to
	//	arbitrary real numbers to avoid a NAN infection.
	for (int j=0; j<gm.size(); j++) { 
		
		// weighted sum and (fractional) number of points explained by this cluster
		double sum = 0.0;
		double data_point_cnt = 0.0;
		i = 0; // data point index
		for (_Ditty d=begin; d!=end; ++d, ++i) {
			sum += h[i][j] * (*d);
			data_point_cnt += h[i][j];	// add this many data points (a fraction of one)
										//	to the total number for this Gaussian
		}

		// Nothing assigned to this Gaussian (this is not an error; just no
		// points close to it -- especially if user is putting extra constraints
		// on the mixture model).  Written as !(x > 0) so that a NaN count is
		// caught as well, and checked before any division by the count.
		if (!(data_point_cnt > 0.0)) { 
			gm[j].mu = 0.0; gm[j].sigma = 1.0; gm[j].w = 0.0; 
			continue;
		}

		// mean and weight of this cluster
		gm[j].mu = sum / data_point_cnt;
		gm[j].w = data_point_cnt / data_count;

		// variance of this cluster
		double sum_sq_diff = 0.0;	// membership-weighted squared difference from mean
		i = 0;
		for (_Ditty d=begin; d!=end; ++d, ++i) { 
			sum_sq_diff +=   h[i][j]
			               * (*d - gm[j].mu)
						   * (*d - gm[j].mu);
		}
		gm[j].sigma = sqrt( sum_sq_diff / data_point_cnt );
		
	}

	// after the weights have been adjusted, need to re-normalize
	gm.normalize();
	
	// enforce any constraints on Gaussian Mixture Models
	constrain(gm);

} // em_itr

/** Generate a GaussianMixture for the data in [begin, end).

	1. For every number of Gaussians from `min` to `max`, cross-validate and
	   record the held-out log-density of each fold.
	2. Starting from `min` Gaussians, move to a larger count only when its mean
	   held-out score is higher AND TTest::twotailed reports the difference as
	   significant at `pvalue`.
	3. Re-fit the chosen count's per-fold mixtures on ALL data and return the
	   one with the highest log-density.

	Returns GaussianMixture(0) when max < min.  When cross-validation produced
	no mixtures at all (folds == 0), a mixture of the chosen size is trained
	directly on all data instead.
*/
template <typename _Ditty, typename _GM_Constraints> // _Ditty = Double Iterator
GaussianMixture GMGenerator::generate(_Ditty begin, _Ditty end, const _GM_Constraints &constrain) { 

	const int candidates = max - min + 1;	// how many different numbers-of-Gaussians to try
	if (candidates <= 0) { return GaussianMixture(0); }	// no acceptable number of Gaussians

	vector<GaussianMixture> gms(candidates);
	vector<vector<GaussianMixture> > trained_mixtures(candidates);
	vector<vector<double> > test_results(candidates);
	vector<real_data_statistics> test_results_stats(candidates);

	for (int i=0; i<candidates; i++) { 
	
		gms[i].resize(min + i);
		test_results[i] = cross_validation_cluster(gms[i], begin, end, constrain, trained_mixtures[i]);
		test_results_stats[i] = get_stats(test_results[i].begin(), test_results[i].end());

	}

	int best_test_results_index = 0;

	for (int i=1; i<candidates; i++) { 

		double tvalue = 0.0;	// filled in by the t-test; not used further here

		if (test_results_stats[i].mean > test_results_stats[best_test_results_index].mean	// if these results are BETTER
			&&																				// AND
			TTest::twotailed(test_results[i].begin(), test_results[i].end(),				// statistically significant
		                     test_results[best_test_results_index].begin(),
							 pvalue, tvalue)) 
		{
			best_test_results_index = i;
		}
	}

	// retrain Mixture on ALL DATA 
	real_data_statistics stats = get_stats(begin, end);
	vector<GaussianMixture> &finalists = trained_mixtures[best_test_results_index];

	if (finalists.empty()) {
		// Cross-validation ran no folds, so there is nothing to re-fit (and
		// indexing finalists[0] would be out of bounds).  Train a mixture of
		// the chosen size on all data from a uniform start instead.
		GaussianMixture &fallback = gms[best_test_results_index];
		init_uniform(fallback, stats);
		em(fallback, begin, end, stats, constrain);
		return fallback;
	}
	
	// NOTE(review): the first fold's mixture is scored on all data as-is (it is
	// not re-fitted), while the remaining folds' mixtures are re-fitted before
	// being compared -- confirm this asymmetry is intended.
	GaussianMixture best_gm = finalists[0];
	double best_density = best_gm.log_density(begin, end);
	
	// Try retraining mixtures that were trained on train' (train prime) set (keep the best on ALL data)
	for (vector<GaussianMixture>::size_type i=1; i<finalists.size(); i++) { 
		em(finalists[i], begin, end, stats, constrain);
		compare(best_gm, finalists[i], best_density, begin, end);
	}
	
	return best_gm;
	
} // generate	

#endif





