/*
	A Conditional Probability Distribution Tree.

	------

	This tree is designed to be used when the training data's 
		feature values are not necessarily known, but rather
		a probability distribution is assigned.  Also, the 
		classification of each example is not known, but
		a probability is assigned to it, too.

	Information gain is used to grow the tree.  MDL is used to 
		prune it afterwards.

	------
	
	CPDTree is a BINARY tree, where each interior node specifies
	a variable and a value.  All examples for which the given
	variable has the given value go down the LEFT path.  All 
	examples that have some other value for the given variable
	go down the RIGHT path.  
	
	The leaves of the tree have a probability distribution over
	the possible values of the child variable (this is because
	the tree is BINARY and the child node can have any number
	of possible values--otherwise leaves would just be unnormalized
	probabilities)
	
	------

	CPDTree is a subclass of CPD (see CPD.h)

	CPDTree has a pointer to a CPDTreeNode, which has a split variable,
	a value ID for the variable (its position in the Variable::values
	vector) and right and left pointers to CPDTreeNode subtrees.

*/

#ifndef CPDTree_H
#define CPDTree_H 1

#include <iostream>
#include <vector>
#include <functional>
#include <string>
#include <sstream>
#include <math.h>		// INFINITY
#include <stdlib.h>
using namespace std;

#include "probability.h"	// Variable
#include "CPD.h"			// CPD, TrainingData
#include "CPT.h"			// CPT* as_table()

// Fallback definition of INFINITY for platforms whose <math.h> does not
// provide the macro.  On IEEE-754 platforms log(0.0) is negative infinity,
// so negating it yields positive infinity.
#ifndef INFINITY	// should be defined in math.h,
                    //	but not on my Solaris machine...
const double INFINITY = -log(0.0);
#endif

// Base-2 logarithm helper, defined only when no `lg` macro exists yet.
// log_base_e_of_2 is the natural logarithm of 2, so
// lg(n) = ln(n) / ln(2) = log2(n).
#ifndef lg
#define log_base_e_of_2 0.69314718055994528622676398299518041312694549560546875
#define lg(n) (log(n)/log_base_e_of_2)
#endif


/**
 *	Defines the RATIO of leaf description length to interior node
 *	description length under which the tree should be pruned.
 *
 *	If the ratio is 1.0, a subtree is pruned iff the cost (description
 *	length) of the LEAF is SMALLER than the cost (description length)
 *	of the subtree.
 *
 *	If the ratio is 2.0, a subtree is pruned whenever the cost
 *	(description length) of a leaf node is UP TO TWICE the cost
 *	(description length) of the subtree.
 *
 *	In other words, higher values represent an inductive bias towards
 *	smaller trees; lower values represent an inductive bias towards
 *	bigger trees that better represent the training data.
 *
 *	This should be set to something >= 1.0, close to 1.0.  Values like
 *	1.1 or 1.2 are reasonable, and will prune trees that have "pretty much"
 *	the same distribution over variable values for both left and right
 *	subtrees.
 *
 *	This is the default ratio used when pruning the tree during a call
 *	to train().
 *
 *	NOTE(review): the value defined below (0.8) is lower than the >= 1.0
 *	range recommended above -- confirm whether that is intentional.
 */
#define CPDTREE_DEFAULT_TRAIN_RATIO_THRESHOLD 0.8

// Forward declarations: CPDTreeNode names CPDTree as a friend before
// CPDTree itself is defined further down in this header.
class CPDTreeNode;
class CPDTree;

/**
 * A single node of a CPDTree: either an INTERIOR (split) node or a LEAF.
 *
 * A node is treated as a leaf exactly when it has no left child.  The
 * destructor deletes both children, so destroying a node destroys the
 * whole subtree below it.
 *
 * NOTE(review): no copy constructor or copy assignment is declared, so a
 * copied node would share its children with the original and both
 * destructors would delete them -- confirm nodes are never copied by value.
 */
class CPDTreeNode {

  friend class CPDTree;

  public:

	// ---- fields meaningful for interior nodes ----
	const Variable *variable;	// variable this node splits on
	int left_value;				// value of `variable` that is routed down the LEFT branch
	CPDTreeNode *left, *right;	// child subtrees (deleted by the destructor)

	// ---- fields meaningful for leaf nodes ----
	distribution dist;	// probability distribution over variable values

	/** @return true when this node has no left child, i.e. it is a leaf */
	bool leaf() const { return !left; }

	/** Default:  a LEAF whose distribution is left empty */
	CPDTreeNode()
		: variable(NULL),
		  left_value(-1),
		  left(NULL),
		  right(NULL),
		  num_features_rem(0),
		  feature_arity_rem(0),
		  trainset_size(0),
		  num_examples_rem(0.0)
	{ }

	/**
	 * Create a LEAF holding a uniform distribution over v values
	 * (v entries, each equal to 1.0/v).  For v <= 0 the distribution
	 * stays empty.
	 */
	CPDTreeNode(int v)
		: variable(NULL),
		  left_value(-1),
		  left(NULL),
		  right(NULL),
		  num_features_rem(0),
		  feature_arity_rem(0),
		  trainset_size(0),
		  num_examples_rem(0.0)
	{
		for (int remaining = v; remaining > 0; --remaining) {
			dist.push_back(1.0/v);
		}
	}

	/**
	 * Create an INTERIOR node.
	 * @param var  variable to split on
	 * @param val  value of var that goes down the left branch
	 * @param L    left subtree
	 * @param R    right subtree
	 * Both subtrees are deleted when this node is destroyed.
	 */
	CPDTreeNode(const Variable *var, int val, CPDTreeNode *L, CPDTreeNode *R)
		: variable(var),
		  left_value(val),
		  left(L),
		  right(R),
		  num_features_rem(0),
		  feature_arity_rem(0),
		  trainset_size(0),
		  num_examples_rem(0.0)
	{ }

	/** Destroy the subtree rooted here (deleting a null child is a no-op) */
	~CPDTreeNode() {
		delete left;
		delete right;
	}

  private:

	// bookkeeping about the training data (for calculating description length, etc.)
	int num_features_rem;	// REMAINING number of features possible (used for pruning)
	int feature_arity_rem;	// REMAINING feature arity
	int trainset_size;		// total size of training data
	double num_examples_rem;// how many examples contributed to prob dist

};


/**
 * A conditional probability distribution represented as a binary tree of
 * CPDTreeNode objects (see the description at the top of this file).
 *
 * NOTE(review): the class declares a destructor and holds a raw `root`
 * pointer but declares no copy constructor or copy assignment; presumably
 * instances are meant to be duplicated through copy() only -- confirm.
 */
class CPDTree : public CPD { 
	// inherits the `variables` list from CPD

  private:

  	CPDTreeNode *root;	// top node of the tree
  	
  public:

  	/** pruning ratio associated with train(); the iterator-range constructor
  	 *  initializes it to CPDTREE_DEFAULT_TRAIN_RATIO_THRESHOLD
  	 */
  	double train_ratio_threshold;

	/** create a CPDTree with the variables in [begin, end);
		initializes to a single leaf with uniform distribution
	 */
	template <typename _Variable_Iterator>
  	CPDTree(_Variable_Iterator begin, _Variable_Iterator end);

	/** create a CPDTree with the following variables;
		initializes to a single leaf with uniform distribution
	*/
	CPDTree(const vector<const Variable*> &variables);
	
	~CPDTree();

	/** CPD members ----------- */

	/** Convert the tree to a *new* CPT table */
	virtual CPT* as_table() const;

	/** Train the tree on the given data (grows it, then prunes using
	 *  the default train ratio -- see the comments at the top of this file)
	 */
	virtual void train(TrainingData*);

	/** Grow the tree */
	void grow(TrainingData*);

	/** Classify an example
	 *  @return a probability distribution for the example
	 */
	virtual distribution classify(ExampleData*) const;

	/** Get a list of variables employed by the tree
	 *  (a subset of CPD::variables)
	 */
	virtual vector<const Variable*> get_variables() const;

	/** Similar question: is the variable with this ID included? */
	virtual bool has(int ID) const; 

	/** Other public members -----*/

	/** DELETE (!) the current tree and replace it with this new one */
	void replace(CPDTreeNode *new_root);

	/** Prune the tree based on MDL 
	 *	(see CPDTREE_DEFAULT_TRAIN_RATIO_THRESHOLD above for the meaning
	 *	of the ratio)
	 *	@param ratio_threshold  leaf/subtree description-length ratio; defaults to 1.0
	 */
	void prune(double ratio_threshold=1.0);

	/** Produce "dot" markup representing the tree */
	void dot(ostream &out) const; // overrides CPD

	/** Print the tree in ASCII format (in prefix order) */
	void ascii(ostream &out) const; // overrides CPD

	/** count the nodes in the tree */
	int size() const;

	/** copy all data; returns a new CPDTree */
	CPDTree* copy() const;

  private:
 
	/** private version of "train", grow (recursive) */
	CPDTreeNode* train_recursive( TrainingData *training_data,
						const vector<double> &examples,
						vector<bool> &valid_features, 
						vector<vector<bool> > &valid_values, 
						const vector<const Variable*> &variables );

	/** private version of "classify" (recursive) */
	distribution classify_recursive(CPDTreeNode *tree, ExampleData *data, 
	                                int dist_size) const;

	/** recursive prune function */
	double prune_recursive(CPDTreeNode *tree, double ratio_threshold);
			 
	/** private version of dot (returns the dot node ID of the subtree) */
	int dot_recursive(CPDTreeNode *tree, ostream &out, int &next_node_id) const;

	/** private version of ascii (prints the node in prefix order) */
	void ascii_recursive(CPDTreeNode *tree, ostream &out, int indent) const;
	
	/** count the nodes in a subtree */
	int size_recursive(CPDTreeNode*) const;

	/** private version of as_table, builds up a CPT */
	void as_table_recursive(CPDTreeNode*, CPT*, int,int,int) const;

	/** private version of has: depth-first, left-first search for ID */
	bool has_recursive(const CPDTreeNode *tree, int ID) const;

	/** private version of copy, allocates new nodes */
	CPDTreeNode* copy_recursive(const CPDTreeNode*) const;
	
	/** personal scoring function, see implementation for details */
	double score(const vector<double> &examples,  
	             const vector<vector<double> > &classes) const;

	/** is a set of counts homogeneous (at most one non-zero entry)? */
	template<typename _Iterator>
	bool homogeneous(_Iterator begin, _Iterator end) const;

	/** count how many 'true' in a list of booleans */
	template <typename _Bool_Iterator>
	int count_flags(_Bool_Iterator begin, _Bool_Iterator end) const;

	/** add everything in a collection of numbers */
	template <typename N, typename _Iterator>
	N sum(_Iterator begin, _Iterator end) const;

	/** description length as leaf (LDL) and as interior node (IDL) */
	double LDL(const CPDTreeNode*) const;
	double IDL(const CPDTreeNode*) const;

	/** calculate probability distribution and number of examples in a current
		training set
		return:  number of examples (sum over argument examples)
	*/
	double calc_prob_dist(vector<double> &dist, 
                          const Variable *variable, 
						  const vector<double> &examples, 
                          TrainingData *training_data);
	
	/** collect all variables that are used in the tree
	 *  (declared without the `CPDTree::` prefix: a qualified name is not
	 *  allowed on a member declaration inside its own class)
	 */
	void mark_variables(CPDTreeNode *tree, vector<const Variable*> &marked) const;


};	// CPDTree class


/* -------- IMPLEMENTATION OF TEMPLATE FUNCTIONS ------------- */

/**
 * Create a CPDTree over the variables in [begin, end).
 *
 * The ADDRESS of each element in the range is appended to the inherited
 * `variables` list (the elements themselves are not copied).  The tree
 * starts out as a single leaf whose uniform distribution is sized by the
 * arity of the last variable on that list.
 *
 * NOTE(review): variables.back() is called unconditionally, so this
 * presumably requires a non-empty range -- confirm against callers.
 */
template <typename VariableIterator>
CPDTree::CPDTree(VariableIterator begin, VariableIterator end) {
	VariableIterator cursor = begin;
	while (cursor != end) {
		this->variables.push_back(&(*cursor));	// store a pointer to the element
		++cursor;
	}

	train_ratio_threshold = CPDTREE_DEFAULT_TRAIN_RATIO_THRESHOLD;
	root = new CPDTreeNode(this->variables.back()->arity());
}


template<typename _Iterator>
bool CPDTree::homogeneous(_Iterator begin, _Iterator end) const { 
	int nonzero = 0;	// no non-zero values yet
	for (_Iterator i=begin; i!=end; i++) { 
		if ( *i != 0.0 ) { nonzero++; }
	}
	return (nonzero <= 1);
}
	
/** count the number of flags in a vector */
template <typename _Bool_Iterator>
int CPDTree::count_flags(_Bool_Iterator begin, _Bool_Iterator end) const { 
	int ans = 0;
	for (_Bool_Iterator i=begin; i!=end; i++) { 
		if (*i) { ans++; }
	}
	return ans;
}

template <typename N, typename _Iterator>
N CPDTree::sum(_Iterator begin, _Iterator end) const { 
	N ans = (N)0;
	for (_Iterator i=begin; i!=end; i++) { 
		ans += *i;
	}
	return ans;
}



#endif	
