com.sjm.machlearn.dataset
Class DataSet

java.lang.Object
  |
  +--com.sjm.machlearn.util.MainClass
        |
        +--com.sjm.machlearn.dataset.DataSet

public class DataSet
extends MainClass

DataSet.java: a class that holds a complete data set of examples.


Field Summary
protected  Example[] data
           
protected  FeatureIdList idlist
           
protected  double[] weights
           
 
Fields inherited from class com.sjm.machlearn.util.MainClass
debug, debug_level, debug_listeners
 
Constructor Summary
DataSet(DataSet ds)
           
DataSet(Example[] ex)
           
DataSet(Feature[][] featurematrix)
           
DataSet(FeatureIdList fid)
           
DataSet(FeatureIdList idl, Example[] dt)
           
DataSet(java.lang.String namesfile, java.lang.String datafile)
           
 
Method Summary
 void copyWeights(DataSet ds)
           
 void copyWeights(double[] wgts)
           
 int countOutput(java.lang.String value)
           
 int[] createMapping(boolean randomize)
           
 Example get(int index)
           
 DataSet[] getBootStrapReplicate()
           
 double getEntropy()
           
 double getEntropyWgt()
           
 FeatureId getFeatureId(int index)
           
 int getFeatureIndex(FeatureId featureid)
           
 FeatureIdList getIdList()
           
 int getMajorityOutputIndex()
           
 int getMajorityOutputIndexWgt()
           
 int[] getOutputCounts()
           
 double[] getOutputCountsWgt()
           
 Feature getOutputFeature()
           
 Feature getOutputFeature(int example_index)
           
 FeatureId getOutputFeatureId()
           
 int getOutputIndex()
           
 int[] getOutputValueIds()
           
 double getTotalWeights()
           
 double getWeight(int index)
           
 boolean hasFeature(FeatureId featureid)
           
 void initializeWeights()
           
 void merge(DataSet newSet)
           
 void normalizeWeights()
           
 int numFeatures()
           
 java.lang.String printFeatureIdList()
           
protected  void readData(java.lang.String datafile)
           
 boolean sameOutputValue()
           
 void setWeight(int index, double val)
           
 int size()
           
 DataSet[] splitDataSet(int numSets, boolean randomize)
           
 DataSet[][] splitDataSetFolds(int numfolds, boolean randomize)
          splitDataSetFolds(): splits the dataset into an array of n folds.
 DataSet[] splitJackKnife(int index)
           
 DataSet[] splitRandom(double testpercent)
          splitRandom(): splits the data into two subsets, based upon testpercent.
 void write(java.lang.String namesfile, java.lang.String datafile)
           
 void writeData(java.lang.String datafile)
           
 void writeNames(java.lang.String namesfile)
           
 
Methods inherited from class com.sjm.machlearn.util.MainClass
_internalError, _internalError, addDebugListener, debugMesg, debugMesg, debugMesg, debugMesg, debugMesg, debugMesg, debugOff, debugOn, internalError, internalError, setDebug, setDebugLevel
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 

Field Detail

idlist

protected FeatureIdList idlist

data

protected Example[] data

weights

protected double[] weights
Constructor Detail

DataSet

public DataSet(DataSet ds)

DataSet

public DataSet(Example[] ex)

DataSet

public DataSet(FeatureIdList fid)

DataSet

public DataSet(Feature[][] featurematrix)
        throws InvalidFeature,
               NoOutputException,
               MultipleOutputException

DataSet

public DataSet(FeatureIdList idl,
               Example[] dt)

DataSet

public DataSet(java.lang.String namesfile,
               java.lang.String datafile)
        throws java.lang.Exception
Method Detail

copyWeights

public void copyWeights(double[] wgts)

copyWeights

public void copyWeights(DataSet ds)

initializeWeights

public void initializeWeights()

getIdList

public FeatureIdList getIdList()

getFeatureId

public FeatureId getFeatureId(int index)

getOutputValueIds

public int[] getOutputValueIds()

hasFeature

public boolean hasFeature(FeatureId featureid)

getFeatureIndex

public int getFeatureIndex(FeatureId featureid)
                    throws MissingFeatureException

get

public Example get(int index)

getOutputIndex

public int getOutputIndex()

getWeight

public double getWeight(int index)

setWeight

public void setWeight(int index,
                      double val)

getTotalWeights

public double getTotalWeights()

normalizeWeights

public void normalizeWeights()

sameOutputValue

public boolean sameOutputValue()

getMajorityOutputIndex

public int getMajorityOutputIndex()

getMajorityOutputIndexWgt

public int getMajorityOutputIndexWgt()

getOutputFeature

public Feature getOutputFeature()

getOutputFeatureId

public FeatureId getOutputFeatureId()

getOutputFeature

public Feature getOutputFeature(int example_index)

getOutputCounts

public int[] getOutputCounts()

getOutputCountsWgt

public double[] getOutputCountsWgt()

getEntropyWgt

public double getEntropyWgt()

getEntropy

public double getEntropy()

size

public int size()

numFeatures

public int numFeatures()

countOutput

public int countOutput(java.lang.String value)
                throws InvalidFeature

writeNames

public void writeNames(java.lang.String namesfile)
                throws java.lang.Exception

writeData

public void writeData(java.lang.String datafile)
               throws java.lang.Exception

write

public void write(java.lang.String namesfile,
                  java.lang.String datafile)
           throws java.lang.Exception

readData

protected void readData(java.lang.String datafile)
                 throws java.lang.Exception

merge

public void merge(DataSet newSet)

splitRandom

public DataSet[] splitRandom(double testpercent)
splitRandom(): splits the data into two subsets, based upon testpercent. Examples are chosen randomly from this dataset. In the returned array, index 0 holds the smaller part of the dataset and index 1 holds the rest of the dataset.

splitJackKnife

public DataSet[] splitJackKnife(int index)

splitDataSetFolds

public DataSet[][] splitDataSetFolds(int numfolds,
                                     boolean randomize)
splitDataSetFolds(): splits the dataset into an array of n folds. The first index of the returned array is the fold number; the second index selects the set within that fold (0 = training set, 1 = test set).

splitDataSet

public DataSet[] splitDataSet(int numSets,
                              boolean randomize)

getBootStrapReplicate

public DataSet[] getBootStrapReplicate()

createMapping

public int[] createMapping(boolean randomize)

printFeatureIdList

public java.lang.String printFeatureIdList()