import java.util.*;
import java.io.*;

////////////////////////////////////////////////////////////////////////////
//                                                                        //
// Code for HW1, Problem 2 - Inducing Decision Trees, CS540, Spring 2008. //
//                                                                        //
////////////////////////////////////////////////////////////////////////////

/* BuildAndTestDecisionTree.java

   Copyright 2008, 2011 by Jude Shavlik.
   May be freely used for non-profit educational purposes.

   To run after compiling, type:

     java BuildAndTestDecisionTree <trainsetFilename> <testsetFilename>

   Eg,

     java BuildAndTestDecisionTree train-house-votes-1984.data test-house-votes-1984.data

   where <trainsetFilename> and <testsetFilename> are the input files of examples.

   Notes: you may separate these classes into individual files if you wish.
          We've put everything in one file for your convenience in getting started.
          All that is required is that you keep the name of the
          BuildAndTestDecisionTree class and don't change the calling
          convention for its main function.

          There is no need to worry about "error detection" when reading data files.
          We'll be responsible for that. HOWEVER, DO BE AWARE THAT WE WILL USE
          ONE OR MORE DIFFERENT DATASETS DURING TESTING, SO DON'T WRITE CODE
          THAT IS SPECIFIC TO THE "VOTES" DATASET. (As stated above, you may
          assume that our additional datasets are properly formatted in the
          style used for the votes data.)

          A weakness of our design is that the category and feature names are
          defined in BOTH the train and test files. These names MUST match,
          though this isn't checked. However, we'll live with the weakness
          because it keeps the code simpler overall (note: you can use the SAME
          filename for both the train and the test set, as a debugging method;
          you should get ALL the test examples correct in this case, since we
          are not "pruning" decision trees to avoid overfitting the training
          data - but be sure you understand Problem 1's method for pruning
          decision trees).
*/

/**
 * Program entry point: reads a training set and a testing set of examples
 * from the two files named on the command line and describes them.
 */
public class BuildAndTestDecisionTree {

    /**
     * Reads in the names of the files we want to use, then reads in their examples.
     *
     * @param args exactly two entries: the training-set file name followed by
     *             the testing-set file name. With any other count the usage
     *             message is printed and the method returns without reading anything.
     */
    public static void main(String[] args) {
        if (args.length != 2) {
            System.out.println("You must call BuildAndTestDecisionTree as follows:\n\n"
                + "  java BuildAndTestDecisionTree <trainsetFilename> <testsetFilename>");
            // Stop here: indexing args below would throw ArrayIndexOutOfBoundsException.
            return;
        }

        // Read in the file names.
        String trainset = args[0];
        String testset  = args[1];

        // Read in the examples from the files.
        ListOfExamples trainExamples = new ListOfExamples(trainset);
        ListOfExamples testExamples  = new ListOfExamples(testset);
        if (!trainExamples.ReadInExamplesFromFile(trainset)
                || !testExamples.ReadInExamplesFromFile(testset)) {
            System.out.println("\nSomething went wrong reading the datasets ... giving up.");
        } else {
            // The following is included so you can see the data organization.
            // You'll need to REPLACE it with code that:
            //
            //  1) uses the TRAINING SET of examples to build a decision tree
            //
            //  2) prints out the induced decision tree (using simple, indented ASCII text)
            //
            //  3) categorizes the TESTING SET using the induced tree, reporting
            //     which examples were INCORRECTLY classified, as well as the
            //     FRACTION that were incorrectly classified.
            //     Just print out the NAMES of the examples incorrectly classified
            //     (though during debugging you might wish to print out the full
            //     example to see if it was processed correctly by your decision tree)
            trainExamples.DescribeDataset();
            testExamples.DescribeDataset();
            if (!trainExamples.isEmpty()) {
                trainExamples.PrintThisExample(0); // Print out an example
            }
            //trainExamples.PrintAllExamples(); // Don't waste paper printing all of this out!
            //testExamples.PrintAllExamples();  // Instead, just look at it on the screen.
        }

        Utilities.waitHere("Hit <enter> when ready to exit.");
    }
}

/**
 * This Class, an extension of Vector, holds an individual example.
 * The items in the vector are the feature values.
 * The method PrintFeatures() can be used to display the contents of the example.
 */
class Example extends Vector<String> {
    public String name, category;  // The name and the category of this example.
    public ListOfExamples parent;  // The data set in which this is one example.

    /**
     * The instance constructor.
     *
     * @param numberOfFeatures initial capacity of the underlying vector
     */
    public Example(int numberOfFeatures) {
        super(numberOfFeatures);
    }

    /**
     * Print out this example in human-readable form. Feature names and the
     * feature count are taken from {@link #parent}, which must be set first.
     */
    public void PrintFeatures() {
        System.out.print("Example " + name + ", category = " + category + "\n");
        for (int i = 0; i < parent.numberOfFeatures; i++) {
            System.out.print(" " + parent.featureNames[i] + " = " + elementAt(i) + "\n");
        }
    }
}

/**
 * A simple class that holds a pair of strings.
 * Since each feature is Boolean, we use a ValuePair to store both of its possible values.
 */
class ValuePair {
    public String firstValue;
    public String secondValue;

    public ValuePair(String first, String second) {
        firstValue  = first;
        secondValue = second;
    }
}

/**
 * This Class holds all of our examples from one dataset
 * (train OR test, not BOTH). It extends the Vector class.
 * Be sure you're not confused. We're using TWO vectors. An Example
 * is a vector of feature values, while a ListOfExamples is a vector of examples.
 * Also, there is one ListOfExamples for the TRAINING SET and one for the TESTING SET.
 */
class ListOfExamples extends Vector<Example> {
    public String[] featureNames;     // The names of the features used to describe examples.
    public ValuePair[] featureValues; // A list of the possible values for each of the features.
    public ValuePair categories;      // The names of the two categories.
    public String nameOfDataset;      // Assign a name, for use in printing info.
    public int numberOfFeatures;      // For future use in classification it is helpful
                                      // to know how many features we have.
    public int numberOfExamples;      // Number of examples in this dataset.

    /**
     * The instance constructor.
     *
     * @param name label for this dataset, used when printing info
     */
    public ListOfExamples(String name) {
        nameOfDataset = name; // Hold on to the name of the data set.
    }

    /**
     * Print a summary of the dataset: its size, the category labels, and each
     * feature with its two possible values. Reads the fields filled in by
     * {@link #ReadInExamplesFromFile(String)}.
     */
    public void DescribeDataset() {
        System.out.println("Dataset " + nameOfDataset + " contains " + numberOfExamples
            + " examples, each with " + numberOfFeatures + " features.");
        System.out.println("Valid category labels: " + categories.firstValue
            + " and " + categories.secondValue);
        System.out.println("The feature names (with their possible values) are:");
        for (int i = 0; i < numberOfFeatures; i++) {
            System.out.println(" " + featureNames[i] + " (" + featureValues[i].firstValue
                + " or " + featureValues[i].secondValue + ")");
        }
    }

    /** Print out ALL the examples. */
    public void PrintAllExamples() {
        System.out.println("List of Examples");
        for (int i = 0; i < size(); i++) {
            elementAt(i).PrintFeatures();
        }
    }

    /**
     * Print out the SPECIFIED example.
     *
     * @param i index of the example to print; must be a valid index into this list
     */
    public void PrintThisExample(int i) {
        elementAt(i).PrintFeatures();
    }

    /**
     * Read this example file from disk.
     * You needn't understand this method. We're taking
     * responsibility of getting data out of a file and
     * into the ListOfExamples and Example instances.
     *
     * The tokens are read in this order: the two category names, the number
     * of features, then (name, firstValue, secondValue) for each feature, the
     * number of examples, then (name, category, one value per feature) for
     * each example. Words are lower-cased and everything on a line after '/'
     * is ignored.
     *
     * @param filename path of the data file to read
     * @return true if the whole file was read; false (after printing a
     *         message) if the file could not be opened, a count could not be
     *         read, a feature value was not one of the two declared values,
     *         or any other exception occurred while reading
     */
    public boolean ReadInExamplesFromFile(String filename) {
        Reader inputFile;
        try {
            // Buffered because StreamTokenizer pulls characters one at a time.
            inputFile = new BufferedReader(new FileReader(filename));
        } catch (IOException ioe) {
            System.out.println("Error opening file: (" + ioe + ")");
            return false;
        }

        try {
            StreamTokenizer fileTokens = new StreamTokenizer(inputFile);
            fileTokens.lowerCaseMode(true); // Ignore case.
            fileTokens.commentChar('/');    // Everything on a line after '/' is ignored.

            // Read in the names of the two possible categories
            String firstValue  = readNextWord(fileTokens);
            String secondValue = readNextWord(fileTokens);
            categories = new ValuePair(firstValue, secondValue);

            numberOfFeatures = readNextInteger(fileTokens);
            if (numberOfFeatures < 0) {
                System.out.println("Could not read the number of features from " + filename);
                return false;
            }

            // Build a vector of all the feature names.
            featureNames  = new String[numberOfFeatures];
            featureValues = new ValuePair[numberOfFeatures];
            for (int i = 0; i < numberOfFeatures; i++) {
                featureNames[i] = readNextWord(fileTokens);
                firstValue  = readNextWord(fileTokens);
                secondValue = readNextWord(fileTokens);
                featureValues[i] = new ValuePair(firstValue, secondValue);
            }

            // Read in the examples.
            numberOfExamples = readNextInteger(fileTokens);
            if (numberOfExamples < 0) {
                // Without this check the loop below is skipped and the read would
                // be reported as a success with a negative example count.
                System.out.println("Could not read the number of examples from " + filename);
                return false;
            }

            for (int i = 0; i < numberOfExamples; i++) {
                Example example = new Example(numberOfFeatures); // Create and fill in an example instance.
                example.parent   = this;                         // Provide a "back" pointer.
                example.name     = readNextWord(fileTokens);
                example.category = readNextWord(fileTokens);

                for (int j = 0; j < numberOfFeatures; j++) {
                    String featureValue = readNextWord(fileTokens);
                    // You may assume that the only feature values ever used
                    // are the ones specified in the header of the file
                    if (featureValue == null
                            || (!featureValue.equalsIgnoreCase(featureValues[j].firstValue)
                                && !featureValue.equalsIgnoreCase(featureValues[j].secondValue))) {
                        System.out.println("Read " + featureValue + " from " + filename
                            + " when expecting \"" + featureValues[j].firstValue
                            + "\" or \"" + featureValues[j].secondValue + "\"."
                            + " Ex #" + i + ", feature #" + j);
                        return false;
                    }
                    example.addElement(featureValue);
                }
                addElement(example); // Add to the list of examples.
            }
            return true; // Indicate success
        } catch (Exception e) {
            // Don't try to do all kinds of fancy error detection and correction.
            System.out.println("Error in ReadInExamples - check " + filename + "\n msg=" + e);
            return false;
        } finally {
            closeQuietly(inputFile);
        }
    }

    /** Close the reader, ignoring a failure to close since all reading is already finished. */
    private static void closeQuietly(Reader reader) {
        try {
            reader.close();
        } catch (IOException ignored) {
            // Nothing useful can be done about a failed close here.
        }
    }

    /**
     * Read the next word in this stream.
     *
     * @return the word, or null (after printing a message) if the next token
     *         is not a word or an I/O error occurs
     */
    private String readNextWord(StreamTokenizer st) {
        try {
            switch (st.nextToken()) {
                case StreamTokenizer.TT_WORD:
                    return st.sval;
                default:
                    System.out.println("Expecting a string in readNextWord().");
                    return null;
            }
        } catch (IOException ioe) {
            System.out.println("I/O Exception? " + ioe);
            return null;
        }
    }

    /**
     * Read the next number in this stream.
     *
     * @return the number truncated to an int, or -1 (after printing a
     *         message) if the next token is not a number or an I/O error occurs
     */
    private int readNextInteger(StreamTokenizer st) {
        try {
            switch (st.nextToken()) {
                case StreamTokenizer.TT_NUMBER:
                    return (int) st.nval;
                default:
                    System.out.println("Expecting an integer in readNextInteger().");
                    return -1;
            }
        } catch (IOException ioe) {
            System.out.println("I/O Exception? " + ioe);
            return -1;
        }
    }
}

/** Small helper routines. */
class Utilities {
    /**
     * This method can be used to wait until you're ready to proceed.
     * Prints a blank line and the message, then reads one byte from standard input.
     *
     * @param msg prompt to print before waiting
     */
    public static void waitHere(String msg) {
        System.out.println("");
        System.out.print(msg);
        try {
            System.in.read();
        } catch (Exception ignored) {
            // Ignore any errors while reading: this is only a pause.
        }
    }
}