
#include "CPDTree.h"

#include <iomanip>	// setprecision for dot output
using namespace std;


// Rescale vec in place so that its entries sum to one.
// Every entry is divided by the total of all entries.  No check is made for
// a zero total: in that case the division is still performed and the entries
// become NaN or infinite.
void normalize_vector(vector<double> &vec) {
	double total = 0.0;
	for (vector<double>::const_iterator it = vec.begin(); it != vec.end(); ++it) {
		total += *it;
	}
	for (vector<double>::iterator it = vec.begin(); it != vec.end(); ++it) {
		*it /= total;
	}
}


/**
 * Create a CPDTree over the given variables.
 *
 * The pointers are appended, in order, to this tree's own variable list.
 * The initial root is a single node constructed from the arity of the LAST
 * variable, and the training prune threshold starts at its default.
 *
 * NOTE: `variables` must not be empty, because back() is dereferenced.
 */
CPDTree::CPDTree(const vector<const Variable*> &variables) {
	this->variables.insert(this->variables.end(), variables.begin(), variables.end());

	this->root = new CPDTreeNode(variables.back()->arity());
	this->train_ratio_threshold = CPDTREE_DEFAULT_TRAIN_RATIO_THRESHOLD;
}

// Release the root node (deleting a null pointer is a no-op).
CPDTree::~CPDTree() {
	delete root;
}


/**
 * Discard the current tree and adopt new_root in its place.
 *
 * The old root is deleted first, so the caller must not pass the tree's
 * current root: it would be deleted and then stored as a dangling pointer.
 */
void CPDTree::replace(CPDTreeNode *new_root) {
	delete root;	// deleting NULL is harmless
	root = new_root;
}


/**
 * Score a candidate two-way split; higher is better.
 *
 * Both arguments are expected to be NORMALIZED already:
 *   examples[b]    fraction of the example weight sent down branch b (b = 0,1)
 *   classes[b][c]  fraction of branch b's weight belonging to class c
 *
 * For each branch the terms -p*lg(p) are summed over the classes of the last
 * (child) variable; a NaN term (e.g. from p == 0) counts as zero.  The branch
 * sums are weighted by examples[b], added up, and the NEGATED total returned.
 */
double CPDTree::score(const vector<double> &examples, const vector<vector<double> > &classes) const {

	const int num_classes = variables.back()->arity();

	double weighted_sum = 0.0;
	for (int branch = 0; branch < 2; ++branch) {
		const vector<double> &p = classes[branch];

		double branch_sum = 0.0;
		for (int c = 0; c < num_classes; ++c) {
			double contribution = -p[c] * lg(p[c]);
			if (isnan(contribution)) { contribution = 0.0; }
			branch_sum += contribution;
		}
		weighted_sum += examples[branch] * branch_sum;
	}
	return -weighted_sum;
}

/**
 * Leaf Description Length: the cost of encoding this node as a leaf.
 *
 * Reads tree->dist (one entry per child value) and tree->num_examples_rem.
 * Returns the computed length; a diagnostic line is written to cout if the
 * result is negative.
 */
double CPDTree::LDL(const CPDTreeNode *tree) const { 

	// leaf description length:
	//	1 bit for "leaf"
	//	lg(C) bits for which category
	//  lg(C-1) bits for each alternate category * (C-1) alternate categories
	//	number-misclassified * lg(N) to identify each misclassified example

	const int C = tree->dist.size();	// number of child node's possible values

	// Cost of describing ONE misclassified example: which example it is, plus
	// which of the (C-1) alternate categories it belongs to.
	// FIX: the first term used to end with ';', which turned the
	// "+ lg(C-1)" line into a discarded expression statement so the
	// alternate-category cost was never added.  The term is skipped when
	// C <= 1 so that lg(0) is never evaluated.
	double XDL = lg((double)(tree->num_examples_rem));
	if (C > 1) { XDL += lg((double)(C-1)); }

	double ans = 1.0 + lg((double)C);	// 1bit=LEAF, lg(C) bits for category

	for (int i=0; i<C; i++) { 
		double extent = tree->dist[i];	// share of the distribution held by category i
		double misclassified = (1.0 - extent) * tree->num_examples_rem;
		double message_length =   lg(tree->num_examples_rem)
		                        + (misclassified * XDL);
		if (message_length < 1.0) { message_length = 1.0; }	// can't have message length shorter than a bit
		ans += extent * message_length;	// weight each category's message by its share
	}

	if (ans < 0.0) { cout << "XXXX LDL says negative.  tree->num_examples_rem = " << tree->num_examples_rem<< "." << endl; }
	
	return ans;
}

/**
 * Interior Node Description Length (cost of this node alone, not its children).
 *
 *	1 bit to say "interior"
 *	lg(num_features_rem) bits to name the feature
 *	lg(feature_arity_rem) bits to name the value
 *
 * A diagnostic line is written to cout if the total comes out negative.
 */
double CPDTree::IDL(const CPDTreeNode *tree) const {

	double bits = 1.0;
	bits += lg((double)(tree->num_features_rem));
	bits += lg((double)(tree->feature_arity_rem));

	if (bits < 0.0) {
		cout << "XXXX IDL says negative.  tree->num_features_rem = " << tree->num_features_rem
		     << ", tree->feature_arity_rem = " << tree->feature_arity_rem << "." << endl;
	}

	return bits;
}


/** Prune subtrees based on MDL */
void CPDTree::prune(double ratio_threshold) { 
	prune_recursive(root, ratio_threshold); 
}


/**
 * Prune the subtree rooted at `tree` bottom-up and return its description length.
 *
 * ratio_threshold is the MAXIMUM value of
 *     Leaf-Description-Length / Interior-Description-Length
 * at which the subtree is collapsed into a leaf:
 *	ratio LOW  <=> prune only if leaf cost is MUCH LOWER
 *  ratio 1.0  <=> prune if leaf cost <= interior cost
 *  ratio HIGH <=> prune unless leaf cost is MUCH HIGHER
 *
 * The interior cost is IDL(tree) plus the (already pruned) costs of both children.
 */
double CPDTree::prune_recursive(CPDTreeNode *tree, double ratio_threshold) {

	const double leaf_cost = LDL(tree);

	// a leaf simply reports its own description length
	if (tree->leaf()) { return leaf_cost; }

	// children are pruned first, so their costs reflect any pruning below
	const double node_cost  = IDL(tree);
	const double left_cost  = prune_recursive(tree->left, ratio_threshold);
	const double right_cost = prune_recursive(tree->right, ratio_threshold);
	const double interior_cost = node_cost + left_cost + right_cost;

	// LOW ratio => a leaf is much cheaper, HIGH ratio => keep the interior node
	const bool collapse = ((leaf_cost / interior_cost) <= ratio_threshold);
	if (!collapse) {
		return interior_cost;	// not pruning, cost is that of the interior node
	}

	// turn this node into a leaf
	delete tree->left;
	tree->left = NULL;
	delete tree->right;
	tree->right = NULL;

	// every node already carries a probability distribution, so only the
	// split information needs to be cleared
	tree->variable = NULL;
	tree->left_value = -1;

	return leaf_cost;
}


/**
 * Train the tree from training data: grow a full tree, then prune it
 * using this tree's train_ratio_threshold.
 */
void CPDTree::train(TrainingData *training_data) {
	this->grow(training_data);
	this->prune(this->train_ratio_threshold);
}

/**
 * Grow a fresh (unpruned) tree from the training data.
 *
 * Any existing tree is deleted.  Every variable starts out as a valid split
 * feature with every value valid, and every training example starts with
 * weight 1.0; train_recursive() does the actual work.
 */
void CPDTree::grow(TrainingData *training_data) {

	delete root;
	root = NULL;

	vector<bool> valid_features(variables.size(), true);
	vector<vector<bool> > valid_values(variables.size());

	for (int v = 0; v < variables.size(); ++v) {
		vector<bool> &flags = valid_values[v];
		const int arity = variables[v]->arity();

		for (int k = 0; k < arity; ++k) { flags.push_back(true); }

		// For a two-valued variable, splitting on value 0 and splitting on
		// value 1 produce the same partition (just with LEFT and RIGHT
		// swapped), so only value 0 is offered as a split.
		if (arity == 2) { flags[1] = false; }
	}

	// one unit weight per training example
	vector<double> examples;
	for (int x = 0; x < training_data->N; ++x) { examples.push_back(1.0); }

	root = train_recursive(training_data, examples, valid_features, valid_values, variables);
}


/**
 * Private recursive version of train: build the subtree for a weighted set
 * of examples and return its (newly allocated) root node.
 *
 * examples[x] is the weight of training example x in this subtree; a weight
 * of exactly 0.0 means the example is ignored.  For every still-valid
 * (feature, value) pair the examples are divided between a LEFT branch
 * (weight times the probability that the feature has that value) and a
 * RIGHT branch (the remainder), the split is scored with score(), and the
 * best acceptable split is kept.  If no split is accepted the node is a leaf.
 *
 * valid_features / valid_values are shared by all recursive calls; they are
 * modified around the recursive calls and restored before returning.
 * The last entry of `variables` is treated as the child (class) variable.
 */
CPDTreeNode* CPDTree::train_recursive( TrainingData *training_data,
                                       const vector<double> &examples,
                                       vector<bool> &valid_features, 
				                       vector<vector<bool> > &valid_values, 
				                       const vector<const Variable*> &variables ) {

	const int F = variables.size() - 1;	// non-child variables 
	const int C = variables.back()->arity();	// number of values for child variable

	const int N = examples.size();	// shorthand:  number of examples

	// return value(s)
	CPDTreeNode *ans = new CPDTreeNode();
	ans->trainset_size = N;	// can set this now

	// best split found so far; only meaningful once best_score has been raised
	int split_var_index;
	int split_val_index;

	vector<double> example_count(2);	// two branches
	vector<vector<double> > class_count(2);
	class_count[0].resize(C);
	class_count[1].resize(C);

	double best_score = -INFINITY;	// stays -INFINITY if no split is accepted

	// per-example weights for the candidate split, and a copy for the best one
	vector<double> left_examples(N);
	vector<double> right_examples(N);
	vector<double> best_left_examples;
	vector<double> best_right_examples;

	// pre-calculate prob_dist (keep this for interior nodes, too, in case they're pruned)
	ans->num_examples_rem = calc_prob_dist(ans->dist, variables.back(), examples, training_data);

	bool homo = homogeneous(ans->dist.begin(), ans->dist.end());	
		// TODO:  may want to take the homogeneity check out.  It will
		//	never be true unless numbers are 100% pure

	// when homo is true the loop is skipped entirely and the node becomes a leaf
	for (int fid=0; !homo && fid<F; fid++) {
		// fid is INDEX in variables array (not actual variable ID)
		if (!valid_features[fid]) { continue; }
		for (int vid=0; vid<valid_values[fid].size(); vid++) { 
			if (!valid_values[fid][vid]) { continue; }

			// split on this feature value

			// zero counts
			for (int x=0; x<N; x++) { 
				left_examples[x] = right_examples[x] = 0.0;
			}
			for (int b=0; b<2; b++) { 
				example_count[b] = 0.0;
				for (int c=0; c<C; c++) { 
					class_count[b][c] = 0.0;
			}	}

			for (int x=0; x<N; x++) { 	// for each example

				// if this example doesn't affect anything, skip it
				if (examples[x] == 0.0) { continue; }

				// lb: value looked up for (feature, example x, value vid); used as the
				// share of this example that goes down the LEFT branch
				double lb = (*training_data)(variables[fid]->ID, x, vid);
				double rb = 1.0 - lb;		// right branch

				left_examples[x] = lb * examples[x];
				right_examples[x] = rb * examples[x];

				example_count[0] += left_examples[x];
				example_count[1] += right_examples[x];
				
				for (int c=0; c<C; c++) { 

					double has_class = (*training_data)(variables.back()->ID, x, c);
						// returns the probability of class 'c', for variable ID, in example x
						
					////double has_class = xc(x,c);	// probability of class 'c'
					class_count[0][c] += has_class * left_examples[x];
					class_count[1][c] += has_class * right_examples[x];
				}
			}

			// normalize counts
			for (int b=0; b<2; b++) { normalize_vector(class_count[b]); }
			normalize_vector(example_count);

			// score this feature	
			double s = score(example_count, class_count);
			
			// check if all examples go down one branch
			// (starts at 1 and drops to 0 when one branch has zero normalized weight)
			int example_flow = 2 - 1;
			for (int b=0; b<2; b++) { 
				if (example_count[b] == 0.0) { example_flow--; }
			}
			
			// check if all split subsets are homogeneous
			// (class_flow starts at C-1 and is decremented for each class whose
			//	normalized share is zero in both branches)
			vector<double> class_count_sum(C);
			for (int c=0; c<C; c++) {
				class_count_sum[c] = 0.0;
				for (int b=0; b<2; b++) { 
					class_count_sum[c] += class_count[b][c];
				}
			}
			int class_flow = C - 1;
			for (int c=0; c<C; c++) { 
				if (class_count_sum[c] == 0.0) { class_flow--; }
			}

			// accept only a strictly better score with non-zero flow counters
			if (s > best_score && example_flow && class_flow) { 

				best_score = s;

				split_var_index = fid;
				split_val_index = vid;
				
				// tentatively set the answer's members (if this remains the best split),
				//	 all this stuff is set here
				ans->variable = variables[fid];
				ans->left_value = vid;
				
				// and the private info
				ans->num_features_rem = count_flags(valid_features.begin(), 
				                                    valid_features.end());
				ans->feature_arity_rem = count_flags(valid_values[fid].begin(), 
					                                 valid_values[fid].end());

				best_left_examples = left_examples;	// copy
				best_right_examples = right_examples; // copy
			}
		} // next value
	} // next feature

	if (best_score == -INFINITY) { 

		// base case, no split, leaf here
		ans->left = ans->right = NULL;	// should already be the case.
		
		// ans->dist distribution and ans->num_examples_rem are already computed and normalized

		// the following info is not used by leaves, so best to set it to
		//	something conspicuous.
		ans->variable = NULL;
		ans->left_value = -1;

	} else { 
	
		// recursive case

		// since I'm using the SAME list of valid features and values for every 
		//	recursion, I'll pop the relevant ones off, grow my children, then
		//	push them back on.

		valid_features[split_var_index] = false;	// can't split on this again, 
		                                    // all examples have same value

		ans->left = train_recursive(training_data, best_left_examples,
									valid_features, valid_values, variables);

		valid_features[split_var_index] = true;

		// on the RIGHT branch only the chosen value is ruled out; the feature
		// itself is disabled only if fewer than two value flags remain set
		valid_values[split_var_index][split_val_index] = false;
		int flag_count = count_flags(valid_values[split_var_index].begin(),
		                             valid_values[split_var_index].end());
		if (flag_count < 2) { valid_features[split_var_index] = false; }	
		
		ans->right = train_recursive(training_data, best_right_examples,
		                             valid_features, valid_values, variables);

		// restore the shared flags for the caller
		if (flag_count < 2) { valid_features[split_var_index] = true; }
		valid_values[split_var_index][split_val_index] = true;

		// all values of 'ans' are set, now	
	}

 	return ans;
} // train_recursive



	
/**
 * Write the tree to `out` as a Graphviz "digraph".
 *
 * The graph title is "CPDTree for " followed by the name of the last (child)
 * variable when there is one; it is also emitted as a white "titlenode".
 * Node ids are handed out by dot_recursive(), starting at 0 for the root.
 */
void CPDTree::dot(ostream &out) const { 

	string title = "CPDTree for ";
	if (variables.size()) { title += variables.back()->name; }

	// The title always starts with the fixed prefix above, so it can never be
	// empty and the title node is written unconditionally.  (The previous
	// `title != ""` test was always true, and an unused local was removed.)
	out << "digraph \"" << title << "\" {" << endl;
	out << "titlenode [color=\"white\" label=\"" << title << "\"];" << endl;

	int next_id = 0;

	dot_recursive(root, out, next_id);

	out << "}" << endl;
}

/**
 * Emit the Graphviz statements for `tree` and everything below it.
 *
 * A leaf becomes a box labelled with its distribution and, in parentheses,
 * num_examples_rem.  An interior node becomes an ellipse labelled with its
 * split variable's name; the edge to the left child is labelled with the
 * split value and the edge to the right child is unlabelled.
 *
 * @param nid  next free node id; incremented for every node written
 * @return     the id given to `tree`
 */
int CPDTree::dot_recursive(CPDTreeNode *tree, ostream &out, int &nid) const { 

	const int INTERIOR_FONTSIZE = 16;
	const string INTERIOR_SHAPE = "ellipse";
	const int LEAF_FONTSIZE = 12;
	const string LEAF_SHAPE = "box";

	const int OSTREAM_PRECISION = 2;	// digits used for the leaf probabilities

	const int id = nid++;

	if (tree->leaf()) {

		out << "node" << id << " ["
		    << "shape=\"" << LEAF_SHAPE << "\", "
			<< "fontsize=\"" << LEAF_FONTSIZE << "\", "
			<< "label=\"";

		// FIX: the precision used to be "undone" with setprecision(-1), which
		// left the caller's stream with a precision of -1 instead of whatever
		// it had before.  Remember the caller's setting and put it back.
		const std::streamsize caller_precision = out.precision(OSTREAM_PRECISION);
		const int C = tree->dist.size();
		for (int c=0; c<C; c++) { 
			out << tree->dist[c];
			if (c < C-1) { out << ", "; }
		}
		out.precision(caller_precision);

		out << "\\n(" << tree->num_examples_rem << ")\"];" << endl;
		
	} else { 

		out << "node" << id << " ["
		    << "shape=\"" << INTERIOR_SHAPE << "\", "
			<< "fontsize=\"" << INTERIOR_FONTSIZE << "\", "
		    << "label=\"" << tree->variable->name << "\"];" << endl;
		
		// children are written (and numbered) before the edges that point at them
		const int leftid = dot_recursive(tree->left, out, nid);
		const int rightid = dot_recursive(tree->right, out, nid);
		out << "node" << id << " -> node" << leftid
		    << " [label=\"" << tree->variable->values[tree->left_value] << "\"];" << endl;
		out << "node" << id << " -> node" << rightid << ";" << endl;
	}

	return id;
}


/** print the tree in prefix */
void CPDTree::ascii(ostream &out) const { ascii_recursive(root, out, 0); }
/**
 * Print `tree` and its subtrees as text, indented by `indent` spaces.
 *
 * A leaf prints the child variable's values followed by the leaf's
 * distribution and, in parentheses, num_examples_rem.  An interior node
 * prints "<name> == <value>" followed by its left subtree, then
 * "<name> != <value>" followed by its right subtree, each subtree indented
 * four further spaces.
 */
void CPDTree::ascii_recursive(CPDTreeNode *tree, ostream &out, int indent) const { 
	
	const int INDENT = 4;
	const int OSTREAM_PRECISION = 2;	// digits used for the probabilities

	if (tree->leaf()) {
		
		const int C = variables.back()->arity();

		for (int i=0; i<indent; i++) { out << " "; }
		out << "distribution over {";
		for (int v=0; v<C; v++) { 
			out << variables.back()->values[v];
			if (v < C-1) { out << ", "; }
		}
		out << "} is [";

		// FIX: setprecision(-1) was used to "undo" the precision, which left
		// the caller's stream at precision -1.  Save the caller's setting and
		// restore it afterwards instead.
		const std::streamsize caller_precision = out.precision(OSTREAM_PRECISION);
		for (int v=0; v<C; v++) { 
			out << tree->dist[v];
			if (v < C-1) { out << ", "; }
		}
		out.precision(caller_precision);

		out << "] (" << tree->num_examples_rem << ")" << endl;
	
	} else { 

		// Find this tree's own entry for the split variable by ID.
		// FIX: the search used to be an unbounded while-loop that would run
		// past variables.end() if the ID was not present; it is now bounded
		// and falls back to the node's own variable pointer.
		const Variable *split_var = tree->variable;
		for (vector<const Variable*>::const_iterator v = variables.begin(); v != variables.end(); ++v) {
			if ((*v)->ID == tree->variable->ID) { split_var = *v; break; }
		}
			
		for (int i=0; i<indent; i++) { out << " "; }
		out << split_var->name << " == " << split_var->values[tree->left_value] << endl;
		ascii_recursive(tree->left, out, indent + INDENT);
		
		for (int i=0; i<indent; i++) { out << " "; }
		out << split_var->name << " != " << split_var->values[tree->left_value] << endl;
		ascii_recursive(tree->right, out, indent + INDENT);

	}
}
	


/**
 * The big classify function: return the distribution over the child
 * variable's values for one example, starting the descent at the root.
 * The result has one entry per value of the last (child) variable.
 */
distribution CPDTree::classify(ExampleData *data) const {
	const int num_classes = variables.back()->arity();
	return classify_recursive(root, data, num_classes);
}


/**
 * Classify one example against the subtree rooted at `tree`.
 *
 * A leaf returns a copy of its own distribution.  An interior node looks up
 * the weight of its LEFT branch for this example, uses one minus that for the
 * RIGHT branch, and returns the weighted mix of the two children's results.
 * A child whose weight is not positive is not visited; its contribution is
 * taken from a freshly constructed distribution instead.
 */
distribution CPDTree::classify_recursive(CPDTreeNode *tree, ExampleData *data, int distribution_size) const {

	if (tree->leaf()) {
		distribution leaf_dist(distribution_size);
		std::copy(tree->dist.begin(), tree->dist.end(), leaf_dist.begin());
		return leaf_dist;
	}

	const double weight_left  = (*data)(tree->variable->ID, tree->left_value);
	const double weight_right = 1.0 - weight_left;

	// only recurse into branches that actually matter for this example
	distribution from_left(distribution_size);
	if (weight_left > 0.0) {
		from_left = classify_recursive(tree->left, data, distribution_size);
	}

	distribution from_right(distribution_size);
	if (weight_right > 0.0) {
		from_right = classify_recursive(tree->right, data, distribution_size);
	}

	distribution mixed(distribution_size);
	for (int c = 0; c < distribution_size; ++c) {
		mixed[c] =   weight_left * from_left[c]
		           + weight_right * from_right[c];
	}
	return mixed;
}

int CPDTree::size() const { return size_recursive(root); }

// Number of nodes in the subtree rooted at `tree`, counting `tree` itself.
int CPDTree::size_recursive(CPDTreeNode *tree) const {
	int count = 1;	// this node
	if (!tree->leaf()) {
		count += size_recursive(tree->left);
		count += size_recursive(tree->right);
	}
	return count;
}


/**
 * Calculate the probability distribution of `variable` over the remaining
 * (weighted) examples.
 *
 * dist is resized to the variable's arity and overwritten: entry c
 * accumulates examples[x] times the training-data value for
 * (variable, example x, value c) over all training_data->N examples, and the
 * result is normalized with normalize_vector().
 *
 * Returns the sum of the example weights.
 */
double CPDTree::calc_prob_dist(vector<double> &dist, 
                               const Variable *variable, 
							   const vector<double> &examples, 
                               TrainingData *training_data) {

	const int num_values = variable->arity();
	dist.assign(num_values, 0.0);	// right size, all zero

	double total_weight = 0.0;
	for (int x = 0; x < training_data->N; ++x) {
		const double weight = examples[x];
		total_weight += weight;
		for (int c = 0; c < num_values; ++c) {
			dist[c] += weight * (*training_data)(variable->ID, x, c);
		}
	}

	normalize_vector(dist);

	return total_weight;
}

/** get a list of variables actually used in this tree (a subset of ALL variables) */
vector<const Variable*> CPDTree::get_variables() const {

	vector<const Variable*> incl_vars;
	mark_variables(root, incl_vars);

	// add child variable
	incl_vars.push_back(variables.back());
	
	return incl_vars;
}

/**
 * Does the tree include ID?  True for the child (last) variable's ID, or for
 * any ID found by searching the tree from the root.
 */
bool CPDTree::has(int ID) const {
	return (ID == variables.back()->ID) || has_recursive(this->root, ID);
}

/**
 * Does the subtree rooted at `tree` include ID?
 * A leaf matches only the child (last) variable's ID; an interior node
 * matches its own split variable, then its left subtree, then its right.
 */
bool CPDTree::has_recursive(const CPDTreeNode *tree, int ID) const {
	if (tree->leaf()) {
		return variables.back()->ID == ID;
	}
	return (tree->variable->ID == ID)
	    || has_recursive(tree->left, ID)
	    || has_recursive(tree->right, ID);
}
	

/**
 * Return a *new* CPT corresponding to the tree (allocated with new).
 *
 * Only variables that actually appear in the tree, plus the child variable,
 * become dimensions of the table.  If the product of their arities exceeds a
 * fixed limit, a copy of the tree is pruned with ratio 1.0 and the table is
 * built from that copy instead.
 *
 * NOTE(review): the oversize path calls as_table() on the pruned copy, which
 * applies the same size test again; if pruning does not bring the size under
 * the limit this looks like it could recurse without end -- confirm.
 */
CPT* CPDTree::as_table() const {

	// PASS #1: which variables does the tree mention?
	vector<const Variable*> incl_vars = get_variables();

	unsigned long size = 1;
	for (vector<const Variable*>::size_type i = 0; i < incl_vars.size(); ++i) {
		size *= incl_vars[i]->arity();
	}

	if (size <= 1236722255) {
		// PASS #2: create a table over just those variables and fill it in
		CPT *table = new CPT(incl_vars);
		as_table_recursive(root, table, 0, table->total_size(), 0);
		return table;
	}

	// too large: prune a private copy and convert that instead
	CPDTree *pruned = this->copy();
	pruned->prune(1.0);
	CPT *table = pruned->as_table();
	delete pruned;

	return table;
}

// Return a new CPDTree that is copy-constructed from this one and then given
// its own duplicate of the node structure (see copy_recursive).
CPDTree* CPDTree::copy() const {
	CPDTree *duplicate = new CPDTree(*this);
	duplicate->root = copy_recursive(root);
	return duplicate;
}

// Return a newly allocated duplicate of the subtree rooted at src.
// The node is copy-constructed from src, its scalar fields are assigned again
// explicitly, and its children are duplicated recursively (NULL for a leaf).
CPDTreeNode* CPDTree::copy_recursive(const CPDTreeNode *src) const {

	CPDTreeNode *node = new CPDTreeNode(*src);

	// split description
	node->variable = src->variable;
	node->left_value = src->left_value;

	// bookkeeping recorded while the tree was grown
	node->num_features_rem = src->num_features_rem;
	node->feature_arity_rem = src->feature_arity_rem;
	node->trainset_size = src->trainset_size;
	node->num_examples_rem = src->num_examples_rem;

	// children: the duplicate never shares child nodes with src
	node->left = NULL;
	node->right = NULL;
	if (src->leaf()) { return node; }

	node->left = copy_recursive(src->left);
	node->right = copy_recursive(src->right);
	return node;
}
		

/**
 * Algorithm for as_table_recursive, which fills in the data of a CPT
 *	according to the distributions in a tree
 *
 * Recall that the LEFT branch corresponds to a certain variable
 *	value, and the RIGHT branch corresponds to every other value.
 * The data in the table is in row-major-order with the child variable being
 *	the last (least significant) in the ordering.  I can create a "pattern"
 *	which tells me which of these table entries are applicable given a path
 *	down the tree.  For instance, if there are two binary parents, A and B,
 *	and I call the (binary) child variable X, the table data is:
 *
 *A:        0   |   1      A has 2 values, offset is 4
 *B:      0 | 1 | 0 | 1    B has 2 values, offset is 2
 *X:     0|1|0|1|0|1|0|1   X has 2 values, offset is 1
 *       -|-|-|-|-|-|-|-
 *data:  0 1 2 3|4 5 6 7
 *
 * The pattern corresponding to B=1 would be (start=2, step=2, skip=4), 
 *	which means start at data index 2, write 2 numbers, then go to 4 positions
 *	after the last place you started.
 * The pattern corresponding to A=0 would be (start=0, step=4, skip=8).
 * In general, the pattern is (value*offset, offset, offset*num_values).
 * Two patterns are combined with (SUM, MIN, MAX).  So, the pattern for
 *	B=1, A=0 would be (start=2, step=2, skip=8), which fills in data[2] and data[3]
 *	as it should.
 *
 * The algorithm here is to start off with the pattern (start=0, step=8, skip=8),
 *	that is, fill in everything (the code passes skip=0 for this initial
 *	pattern, which is handled as a special case meaning "write once"), then go
 *	down the RIGHT branches and fill in the data according to the pattern,
 *	then add in the pattern for the (variable,value), then go down the LEFT branch.
 *
 * For example, the tree
 *
 *              A
 *            0/ \
 *           B     C
 *         0/ \  0/ \
 *         0   1 1  0
 *
 *	we first go down the right branch (A!=0, C!=0), 
 *	make the whole table X=0, go up to node "C",
 *	set the pattern to C==0, go down the left branch (A!=0,C==0)
 *	set the data to X=1 where the pattern C==0 applies,
 *	go up to the root, set the pattern to A==0, go to the node "B",
 *	go down the right branch, set everything where A==0 to X=1,
 *	go back up to "B", set the pattern to A==0 AND B==0, then go 
 *	down the left branch.  To clarify, we will overwrite some data
 *	when we go down each LEFT branch.
 *
 */
// Fill `table` from the subtree rooted at `tree`, writing into the entries
// selected by the pattern (start, step, skip): beginning at index `start`,
// write `step` values, then advance `skip` positions from where the run began.
// The RIGHT subtree is written first with the current pattern; the pattern is
// then narrowed to this node's (variable, value) and the LEFT subtree
// overwrites the entries it selects.
void CPDTree::as_table_recursive(CPDTreeNode *tree, CPT *table, int start, int step, int skip) const {

	if (!tree->leaf()) {

		// right branch: everything the current pattern selects
		as_table_recursive(tree->right, table, start, step, skip);

		// narrow the pattern for the left branch
		const int offset = table->var_offset(tree->variable->ID);
		const int period = offset * tree->variable->arity();

		start += (tree->left_value * offset);		// SUM
		if (offset < step) { step = offset; }		// MIN
		if (period > skip) { skip = period; }		// MAX

		// left branch: overwrite the entries where variable == left_value
		as_table_recursive(tree->left, table, start, step, skip);
		return;
	}

	// leaf: write the distribution (repeated cyclically) into every selected run
	double *cursor = table->begin() + start;
	while (cursor < table->end()) {
		for (int m = 0; m < step; ++m) {
			cursor[m] = tree->dist[m%(tree->dist.size())];
		}
		// skip==0 is a special case meaning "a single run only".  (skip equal
		// to the table size would be more natural for setting ALL values, but
		// going down a LEFT branch sets skip to the MAX, so with
		// skip==table_size not all the values would get set.)
		if (skip == 0) { break; }
		cursor += skip;
	}
}



/**
 * Collect all variables (besides the child node) mentioned in the tree.
 *
 * Each interior node's split variable is appended to `marked` unless a
 * variable with the same ID is already there; the left subtree is visited
 * before the right one.  Traversal stops early once `marked` is as long as
 * this tree's full variable list.
 *
 * NOTE(review): the full list also holds the child variable, which is never
 * marked here, so the early-exit test looks unreachable -- confirm whether
 * size()-1 was intended.
 */
void CPDTree::mark_variables(CPDTreeNode *tree, vector<const Variable*> &marked) const {

	if (tree->leaf()) { return; }

	// interior node: is its variable already in the list?
	bool seen = false;
	for (vector<const Variable*>::const_iterator m = marked.begin();
	     m != marked.end() && !seen; ++m) {
		seen = ((*m)->ID == tree->variable->ID);
	}
	if (!seen) { marked.push_back(tree->variable); }

	if (marked.size() != variables.size()) {
		mark_variables(tree->left, marked);
		if (marked.size() != variables.size()) {
			mark_variables(tree->right, marked);
		}
	}
}


