00001 
00002 
00003 
00004 
00005 
00006 
00007 
00008 
00009 
00010 
00011 
00012 
00013 
00014 
00015 
00016 
00017 
00018 
00019 
00020 
00021 
00022 
00023 
00024 
00025 
00026 
00027 
00028 
00029 
00030 
00031 
00043 #ifndef DATA_DATASET_H
00044 #define DATA_DATASET_H
00045 
00046 #include "fastlib/col/col_string.h"
00047 #include "fastlib/la/matrix.h"
00048 #include "fastlib/math/discrete.h"
00049 #include "fastlib/file/textfile.h"
00050 
00051 class TextLineReader;
00052 class TextWriter;
00053 
00059 class DatasetFeature {
00060  public:
00064   enum Type {
00066       CONTINUOUS,
00068       INTEGER,
00070       NOMINAL
00071   };
00072   
00073  private:
00075   String name_;
00077   Type type_;
00079   ArrayList<String> value_names_;
00080   
00081   OBJECT_TRAVERSAL(DatasetFeature) {
00082     OT_OBJ(name_);
00083     
00084     OT_ENUM_EXPERT(type_, int,
00085       OT_ENUM_VAL(CONTINUOUS)
00086       OT_ENUM_VAL(INTEGER)
00087       OT_ENUM_VAL(NOMINAL));
00088     OT_OBJ(value_names_);
00089   }
00090 
00096  void InitGeneral(const char *name_in) {
00097     name_.Copy(name_in);
00098     value_names_.Init();
00099  }
00100 
00101  public:
00107   void InitContinuous(const char *name_in) {
00108     InitGeneral(name_in);
00109     type_ = CONTINUOUS;
00110   }
00111 
00117   void InitInteger(const char *name_in) {
00118     InitGeneral(name_in);
00119     type_ = INTEGER;
00120   }
00121 
00131   void InitNominal(const char *name_in) {
00132     InitGeneral(name_in);
00133     type_ = NOMINAL;
00134   }
00135   
00146   void Format(double value, String *result) const;
00147   
00160   success_t Parse(const char *str, double *d) const;
00161   
00167   const String& name() const {
00168     return name_;
00169   }
00170   
00176   Type type() const {
00177     return type_;
00178   }
00179   
00187   const String& value_name(int value) const {
00188     DEBUG_ASSERT(type_ == NOMINAL);
00189     return value_names_[value];
00190   }
00191   
00200   index_t n_values() const {
00201     return value_names_.size();
00202   }
00203   
00211   ArrayList<String>& value_names() {
00212     return value_names_;
00213   }
00214 };
00215 
00219 class DatasetInfo {
00220  private:
00221   String name_;
00222   ArrayList<DatasetFeature> features_;
00223 
00224   OBJECT_TRAVERSAL(DatasetInfo) {
00225     OT_OBJ(name_);
00226     OT_OBJ(features_);
00227   }
00228 
00229  public:
00231   ArrayList<DatasetFeature>& features() {
00232     return features_;
00233   }
00234 
00236   const DatasetFeature& feature(index_t attrib_num) const {
00237     return features_[attrib_num];
00238   }
00239 
00241   index_t n_features() const {
00242     return features_.size();
00243   }
00244   
00246   const char *name() const {
00247     return name_;
00248   }
00249   
00251   void set_name(const char *name_in) {
00252     name_.Destruct();
00253     name_.Copy(name_in);
00254   }
00255 
00259   bool is_all_continuous() const;
00260 
00267   void InitContinuous(index_t n_features,
00268       const char *name_in = "dataset");
00269 
00277   void Init(const char *name_in = "dataset");
00278 
00282   void WriteArffHeader(TextWriter *writer) const;
00283   
00290   void WriteCsvHeader(const char *sep, TextWriter *writer) const;
00291 
00299   void WriteMatrix(const Matrix& matrix, const char *sep,
00300       TextWriter *writer) const;
00301 
00313   success_t InitFromArff(TextLineReader *reader,
00314       const char *filename = "dataset");
00315   
00323   success_t InitFromCsv(TextLineReader *reader,
00324       const char *filename = "dataset");
00325 
00333   success_t InitFromFile(TextLineReader *reader,
00334       const char *filename = "dataset");
00344   success_t ReadMatrix(TextLineReader *reader, Matrix *matrix) const;
00345 
00356   success_t ReadPoint(TextLineReader *reader, double *point,
00357       bool *is_done) const;
00358 
00359  private:
00360   char *SkipSpace_(char *s);
00361 
00362   char *SkipNonspace_(char *s);
00363 
00364   void SkipBlanks_(TextLineReader *reader);
00365 
00366 };
00367 
00368 
00380 class Dataset {
00381  private:
00382   Matrix matrix_;
00383   DatasetInfo info_;
00384   
00385   OBJECT_TRAVERSAL(Dataset) {
00386     OT_OBJ(matrix_);
00387     OT_OBJ(info_);
00388   }
00389   
00390  public:
00400   const DatasetInfo& info() const {
00401     return info_;
00402   }
00403   
00408   DatasetInfo& info() {
00409     return info_;
00410   }
00411   
00419   index_t n_features() const {
00420     return matrix_.n_rows();
00421   }
00422   
00430   index_t n_points() const {
00431     return matrix_.n_cols();
00432   }
00433 
00442   index_t n_labels() const;
00443 
00464   void GetLabels(ArrayList<double> &labels_list,
00465                  ArrayList<index_t> &labels_index,
00466                  ArrayList<index_t> &labels_ct,
00467                  ArrayList<index_t> &labels_startpos) const;
00468  
00475   double get(index_t feature, index_t point) const {
00476     return matrix_.get(feature, point);
00477   }
00478   
00482   int get_int(index_t feature, index_t point) const {
00483     double d = get(feature, point);
00484     int i = int(d);
00485     DEBUG_ASSERT(d == double(i));
00486     return i;
00487   }
00488   
00496   void set(index_t feature, index_t point, double d) {
00497     matrix_.set(feature, point, d);
00498   }
00499   
00505   const double *point(index_t point) const {
00506     return matrix_.GetColumnPtr(point);
00507   }
00513   double *point(index_t point) {
00514     return matrix_.GetColumnPtr(point);
00515   }
00516   
00520   const Matrix& matrix() const {
00521     return matrix_;
00522   }
00527   Matrix& matrix() {
00528     return matrix_;
00529   }
00530   
00538   void Format(index_t feature, index_t point, String *result) const {
00539     info_.feature(feature).Format(get(feature, point), result);
00540   }
00541   
00549   void InitBlank() {
00550   }
00551   
00560   success_t InitFromFile(const char *fname);
00561   
00572   success_t InitFromFile(TextLineReader *reader,
00573       const char *filename = "dataset");
00574   
00582   success_t WriteCsv(const char *fname, bool header = false) const;
00583 
00589   success_t WriteArff(const char *fname) const;
00590 
00597   void CopyMatrix(const Matrix& matrix_in) {
00598     InitBlank();
00599     matrix_.Copy(matrix_in);
00600     info_.InitContinuous(matrix_.n_rows());
00601   }
00602   
00613   void OwnMatrix(Matrix* matrix_in) {
00614     InitBlank();
00615     matrix_.Own(matrix_in);
00616     info_.InitContinuous(matrix_.n_rows());
00617   }
00618   
00629   void AliasMatrix(const Matrix& matrix_in) {
00630     InitBlank();
00631     matrix_.Alias(matrix_in);
00632     info_.InitContinuous(matrix_.n_rows());
00633   }
00634   
00635   
00636 
00652   void SplitTrainTest(int folds, int fold_number,
00653       const ArrayList<index_t>& permutation,
00654       Dataset *train, Dataset *test) const;
00655 };
00656 
00660 namespace data {
00675   success_t Load(const char *fname, Matrix *matrix);
00690   template<typename Precision>
00691   success_t LargeLoad(const char *fname, GenMatrix<Precision> *matrix) {
00692     TextLineReader *reader = new TextLineReader();
00693     if (reader->Open(fname)==SUCCESS_FAIL) {
00694       reader->Error("Couldn't open %s", fname);
00695       return SUCCESS_FAIL;
00696     } 
00697     index_t dimension=0;
00698     String line=reader->Peek();
00699     ArrayList<String> result;
00700     result.Init();
00701     line.Split(",", &result);
00702     dimension=result.size();
00703     while (reader->Gobble()) {
00704     }
00705     matrix->StaticInit(dimension, reader->line_num());
00706     matrix->SetAll(0.0);
00707     delete reader;
00708     reader = new TextLineReader();
00709     reader->Open(fname);
00710     while (true) {
00711       String line=reader->Peek();
00712       ArrayList<String> result;
00713       result.Init();
00714       line.Split(",", &result);
00715       for(index_t i=0; i<result.size(); i++) {
00716         Precision num;
00717         sscanf(result[i].c_str(), "%lf", &num);
00718         matrix->set(i, reader->line_num()-1, (Precision)num);
00719       }
00720       if (reader->Gobble()==false) {
00721         break;
00722       }
00723     }
00724     return SUCCESS_PASS;  
00725   }
00726 
00727 
00742   success_t Save(const char *fname, const Matrix& matrix);
00743 };
00744 
00745 #endif