dataset.cc

/* MLPACK 0.2
 *
 * Copyright (c) 2008, 2009 Alexander Gray,
 *                          Garry Boyer,
 *                          Ryan Riegel,
 *                          Nikolaos Vasiloglou,
 *                          Dongryeol Lee,
 *                          Chip Mappus,
 *                          Nishant Mehta,
 *                          Hua Ouyang,
 *                          Parikshit Ram,
 *                          Long Tran,
 *                          Wee Chin Wong
 *
 * Copyright (c) 2008, 2009 Georgia Institute of Technology
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA.
 */
#include "fastlib/base/base.h"

#include "fastlib/data/dataset.h"
//#include "dataset.h"


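// Formats a single value according to this feature's type and writes it into
// *result; NaN is rendered as the missing-value marker "?".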
void DatasetFeature::Format(double value, String *result) const {
  if (unlikely(isnan(value))) {
    result->Copy("?");
    return;
  }
  switch (type_) {
    case CONTINUOUS:
      if (floor(value) != value) {
        // non-integer
        result->InitSprintf("%1.17e", value);
      } else {
        // value is actually an integer
        result->InitSprintf("%.17g", value);
      }
      break;
    case INTEGER: result->InitSprintf("%ld", long(value)); break;
    case NOMINAL: result->InitCopy(value_name(int(value))); break;
    #ifdef DEBUG
    default: abort();
    #endif
  }
}

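// Parses one token into a double according to this feature's type.  "?" is
// read as NaN; for NOMINAL features the token is looked up in the value-name
// list and its index is stored in *d.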
success_t DatasetFeature::Parse(const char *str, double *d) const {
  if (unlikely(str[0] == '?') && unlikely(str[1] == '\0')) {
    *d = DBL_NAN;
    return SUCCESS_PASS;
  }
  switch (type_) {
    case CONTINUOUS: {
        char *end;
        *d = strtod(str, &end);
        if (likely(*end == '\0')) {
          return SUCCESS_PASS;
        } else {
          return SUCCESS_FAIL;
        }
      }
    case INTEGER: {
      int i;
      if (sscanf(str, "%d", &i) == 1) {
        *d = i;
        return SUCCESS_PASS;
      } else {
        return SUCCESS_FAIL;
      }
    }
    case NOMINAL: {
      index_t i;
      for (i = 0; i < value_names_.size(); i++) {
        if (value_names_[i] == str) {
          *d = i;
          return SUCCESS_PASS;
        }
      }
      *d = DBL_NAN;
      return SUCCESS_FAIL;
    }
    default: abort();
  }
}

// DatasetInfo ------------------------------------------------------


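// Initializes the info with n_features continuous features, auto-named
// "feature_0", "feature_1", ...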
void DatasetInfo::InitContinuous(index_t n_features,
    const char *name_in) {
  features_.Init(n_features);

  name_.Copy(name_in);

  for (index_t i = 0; i < n_features; i++) {
    String feature_name;
    feature_name.InitSprintf("feature_%d", int(i));
    features_[i].InitContinuous(feature_name);
  }
}

void DatasetInfo::Init(const char *name_in) {
  features_.Init();
  name_.Copy(name_in);
}

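// Skips leading whitespace; if what remains is empty or a '%' comment,
// returns a pointer to the end of the line instead.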
char *DatasetInfo::SkipSpace_(char *s) {
  while (isspace(*s)) {
    s++;
  }

  if (unlikely(*s == '%') || unlikely(*s == '\0')) {
    return s + strlen(s);
  }

  return s;
}

char *DatasetInfo::SkipNonspace_(char *s) {
  while (likely(*s != '\0')
      && likely(*s != '%')
      && likely(*s != ' ')
      && likely(*s != '\t')) {
    s++;
  }

  return s;
}

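// Consumes lines that contain nothing but whitespace or '%' comments.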
void DatasetInfo::SkipBlanks_(TextLineReader *reader) {
  while (reader->MoreLines() && *SkipSpace_(reader->Peek().begin()) == '\0') {
    reader->Gobble();
  }
}

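// Parses an ARFF header (@relation, @attribute, ...) and fills in the feature
// list; stops after consuming the @data line.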
success_t DatasetInfo::InitFromArff(TextLineReader *reader,
    const char *filename) {
  success_t result = SUCCESS_PASS;

  Init(filename);

  while (1) {
    SkipBlanks_(reader);

    String *peeked = &reader->Peek();
    ArrayList<String> portions;

    portions.Init();
    peeked->Split(0, " \t", "%", 3, &portions);

    if (portions.size() == 0) {
      /* empty line */
    } else if (portions[0][0] != '@') {
      reader->Error("ARFF: Expected a line starting with '@'.  Did you forget @data?");
      result = SUCCESS_FAIL;
      break;
    } else {
      if (portions[0].EqualsNoCase("@relation")) {
        if (portions.size() < 2) {
          reader->Error("ARFF: @relation requires name");
          result = SUCCESS_FAIL;
        } else {
          set_name(portions[1]);
        }
      } else if (portions[0].EqualsNoCase("@attribute")) {
        if (portions.size() < 3) {
          reader->Error("ARFF: @attribute requires name and type.");
          result = SUCCESS_FAIL;
        } else {
          if (portions[2][0] == '{') { //}
            DatasetFeature *feature = &features_.PushBack();

            feature->InitNominal(portions[1]);
            // TODO: Doesn't support values with spaces {
            portions[2].Split(1, ", \t", "}%", 0, &feature->value_names());
          } else {
            String type(portions[2]);
            //portions[2].Trim(" \t", &type);
            if (type.EqualsNoCase("numeric")
                || type.EqualsNoCase("real")) {
              features_.PushBack().InitContinuous(portions[1]);
            } else if (type.EqualsNoCase("integer")) {
              features_.PushBack().InitInteger(portions[1]);
            } else {
              reader->Error(
                  "ARFF: Only 'numeric', 'real', 'integer', and {nominal} types are supported.");
              result = SUCCESS_FAIL;
            }
          }
        }
      } else if (portions[0].EqualsNoCase("@data")) {
        /* Done! */
        reader->Gobble();
        break;
      } else {
        reader->Error("ARFF: Expected @relation, @attribute, or @data.");
        result = SUCCESS_FAIL;
        break;
      }
    }

    reader->Gobble();
  }

  return result;
}

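// Initializes feature info for a CSV file.  If the first line contains a
// non-numeric token it is treated as a header row and consumed; otherwise the
// features are auto-named "feature0", "feature1", ...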
success_t DatasetInfo::InitFromCsv(TextLineReader *reader,
    const char *filename) {
  ArrayList<String> headers;
  bool nonnumeric = false;

  Init(filename);

  headers.Init();
  reader->Peek().Split(", \t", &headers);

  if (headers.size() == 0) {
    reader->Error("Trying to parse empty file as CSV.");
    return SUCCESS_FAIL;
  }

  // Try to auto-detect if there is a header row
  for (index_t i = 0; i < headers.size(); i++) {
    char *end;

    (void) strtod(headers[i], &end);

    if (end != headers[i].end()) {
      nonnumeric = true;
      break;
    }
  }

  if (nonnumeric) {
    for (index_t i = 0; i < headers.size(); i++) {
      features_.PushBack().InitContinuous(headers[i]);
    }
    reader->Gobble();
  } else {
    for (index_t i = 0; i < headers.size(); i++) {
      String name;
#ifndef LI
#define LI ""
#endif
      name.InitSprintf("feature%"LI"d", i);
      features_.PushBack().InitContinuous(name);
    }
  }

  return SUCCESS_PASS;
}

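// Detects the file format from the first non-blank line: a leading '@' means
// ARFF, anything else is treated as CSV.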
success_t DatasetInfo::InitFromFile(TextLineReader *reader,
    const char *filename) {
  SkipBlanks_(reader);

  char *first_line = SkipSpace_(reader->Peek().begin());

  if (*first_line == '\0') {
    Init();
    reader->Error("Could not parse the first line.");
    return SUCCESS_FAIL;
  } else if (*first_line == '@') {
    /* Okay, it's ARFF. */
    return InitFromArff(reader, filename);
  } else {
    /* It's CSV.  We'll try to see if there are headers. */
    return InitFromCsv(reader, filename);
  }
}

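// Counts the distinct labels, assuming the last row of the matrix holds the
// class label of each point (column).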
index_t Dataset::n_labels() const {
  index_t i = 0;
  index_t label_row_idx = matrix_.n_rows() - 1; // the last row is for labels
  index_t n_labels = 0;

  double current_label;

  ArrayList<double> labels_list;
  labels_list.Init();
  labels_list.PushBack() = matrix_.get(label_row_idx, 0);
  n_labels++;

  for (i = 1; i < matrix_.n_cols(); i++) {
    current_label = matrix_.get(label_row_idx, i);
    index_t j = 0;
    for (j = 0; j < n_labels; j++) {
      if (current_label == labels_list[j]) {
        break;
      }
    }
    if (j == n_labels) { // new label
      labels_list.PushBack() = current_label;
      n_labels++;
    }
  }
  labels_list.Clear();
  return n_labels;
}

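// Collects label statistics, assuming the last row of the matrix holds the
// labels: the distinct labels, their counts, the start position of each
// label's block, and the point indices grouped by label.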
void Dataset::GetLabels(ArrayList<double> &labels_list,
                        ArrayList<index_t> &labels_index,
                        ArrayList<index_t> &labels_ct,
                        ArrayList<index_t> &labels_startpos) const {
  index_t i = 0;
  index_t label_row_idx = matrix_.n_rows() - 1; // the last row is for labels
  index_t n_points = matrix_.n_cols();
  index_t n_labels = 0;

  double current_label;

  // these ArrayLists must already have been initialized by the caller; renew them here
  labels_list.Renew();
  labels_index.Renew();
  labels_ct.Renew();
  labels_startpos.Renew();

  labels_index.Init(n_points);
  labels_list.Init();
  labels_ct.Init();
  labels_startpos.Init();

  ArrayList<index_t> labels_temp;
  labels_temp.Init(n_points);
  labels_temp[0] = 0;

  labels_list.PushBack() = matrix_.get(label_row_idx, 0);
  labels_ct.PushBack() = 1;
  n_labels++;

  for (i = 1; i < n_points; i++) {
    current_label = matrix_.get(label_row_idx, i);
    index_t j = 0;
    for (j = 0; j < n_labels; j++) {
      if (current_label == labels_list[j]) {
        labels_ct[j]++;
        break;
      }
    }
    labels_temp[i] = j;
    if (j == n_labels) { // new label
      labels_list.PushBack() = current_label; // add new label to list
      labels_ct.PushBack() = 1;
      n_labels++;
    }
  }

  labels_startpos.PushBack() = 0;
  for (i = 1; i < n_labels; i++) {
    labels_startpos.PushBack() = labels_startpos[i-1] + labels_ct[i-1];
  }

  for (i = 0; i < n_points; i++) {
    labels_index[labels_startpos[labels_temp[i]]] = i;
    labels_startpos[labels_temp[i]]++;
  }

  labels_startpos[0] = 0;
  for (i = 1; i < n_labels; i++)
    labels_startpos[i] = labels_startpos[i-1] + labels_ct[i-1];

  labels_temp.Clear();
}

bool DatasetInfo::is_all_continuous() const {
  for (index_t i = 0; i < features_.size(); i++) {
    if (features_[i].type() != DatasetFeature::CONTINUOUS) {
      return false;
    }
  }

  return true;
}

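// Reads the data rows into a column-major matrix with one point per column;
// the matrix takes ownership of the linearized buffer.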
success_t DatasetInfo::ReadMatrix(TextLineReader *reader, Matrix *matrix) const {
  ArrayList<double> linearized;
  index_t n_features = this->n_features();
  index_t n_points = 0;
  success_t retval = SUCCESS_PASS;
  bool is_done;

  linearized.Init();

  do {
    double *point = linearized.PushBackRaw(n_features);
    retval = ReadPoint(reader, point, &is_done);
    n_points++;
  } while (!is_done && !FAILED(retval));

  if (!FAILED(retval)) {
    DEBUG_ASSERT(linearized.size() == n_features * n_points);
    DEBUG_ASSERT(linearized.size() >= n_features);
    DEBUG_ASSERT(linearized.size() % n_features == 0);
    n_points--;
    linearized.Resize(n_features * n_points);
  }

  linearized.Trim();

  matrix->Own(linearized.ReleasePtr(), n_features, n_points);

  return retval;
}

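// Reads one data row (entries separated by spaces, tabs, or commas) into
// point[], skipping blank lines and '%' comments; sets *is_done instead when
// no lines remain.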
success_t DatasetInfo::ReadPoint(TextLineReader *reader, double *point,
    bool *is_done) const {
  index_t n_features = this->n_features();
  char *pos;

  *is_done = false;

  for (;;) {
    if (!reader->MoreLines()) {
      *is_done = true;
      return SUCCESS_PASS;
    }

    pos = reader->Peek().begin();

    while (*pos == ' ' || *pos == '\t' || *pos == ',') {
      pos++;
    }

    if (unlikely(*pos == '\0' || *pos == '%')) {
      reader->Gobble();
    } else {
      break;
    }
  }

  for (index_t i = 0; i < n_features; i++) {
    char *next;

    while (*pos == ' ' || *pos == '\t' || *pos == ',') {
      pos++;
    }

    if (unlikely(*pos == '\0')) {
      for (char *s = reader->Peek().begin(); s < pos; s++) {
        if (!*s) {
          *s = ',';
        }
      }
      reader->Error("Expected %"LI"d entries per row, "
          "but this line has only %"LI"d.",
          n_features, i);
      return SUCCESS_FAIL;
    }

    next = pos;
    while (*next != '\0' && *next != ' ' && *next != '\t' && *next != ','
        && *next != '%') {
      next++;
    }

    if (*next != '\0') {
      char c = *next;
      *next = '\0';
      if (c != '%') {
        next++;
      }
    }

    if (!PASSED(features_[i].Parse(pos, &point[i]))) {
      char *end = reader->Peek().end();
      String tmp;
      tmp.Copy(pos);
      for (char *s = reader->Peek().begin(); s < next && s < end; s++) {
        if (*s == '\0') {
          *s = ',';
        }
      }
      reader->Error("Invalid parse: [%s]", tmp.c_str());
      return SUCCESS_FAIL;
    }

    pos = next;
  }

  while (*pos == ' ' || *pos == '\t' || *pos == ',') {
    pos++;
  }

  if (*pos != '\0') {
    for (char *s = reader->Peek().begin(); s < pos; s++) {
      if (*s == '\0') {
        *s = ',';
      }
    }
    reader->Error("Extra junk on line.");
    return SUCCESS_FAIL;
  }

  reader->Gobble();

  return SUCCESS_PASS;
}


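// Writes an ARFF header for this dataset.  Nominal features list their values
// in braces; all other feature types are written as "real".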
void DatasetInfo::WriteArffHeader(TextWriter *writer) const {
  writer->Printf("@relation %s\n", name_.c_str());

  for (index_t i = 0; i < features_.size(); i++) {
    const DatasetFeature *feature = &features_[i];
    writer->Printf("@attribute %s ", feature->name().c_str());
    if (feature->type() == DatasetFeature::NOMINAL) {
      writer->Printf("{");
      for (index_t v = 0; v < feature->n_values(); v++) {
        if (v != 0) {
          writer->Write(",");
        }
        writer->Write(feature->value_name(v).c_str());
      }
      writer->Printf("}");
    } else {
      writer->Write("real");
    }
    writer->Write("\n");
  }
  writer->Printf("@data\n");
}

void DatasetInfo::WriteCsvHeader(const char *sep, TextWriter *writer) const {
  for (index_t i = 0; i < features_.size(); i++) {
    if (i != 0) {
      writer->Write(sep);
    }
    writer->Write(features_[i].name().c_str());
  }
  writer->Write("\n");
}

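// Writes the matrix one point (column) per line, formatting each entry with
// its feature's Format() and separating entries with sep.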
void DatasetInfo::WriteMatrix(const Matrix& matrix, const char *sep,
    TextWriter *writer) const {
  for (index_t i = 0; i < matrix.n_cols(); i++) {
    for (index_t f = 0; f < features_.size(); f++) {
      if (f != 0) {
        writer->Write(sep);
      }
      String str;
      features_[f].Format(matrix.get(f, i), &str);
      writer->Write(str);
    }
    writer->Write("\n");
  }
}

// Dataset ------------------------------------------------------------------

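// Loads a dataset from a named file, detecting ARFF or CSV from its contents.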
success_t Dataset::InitFromFile(const char *fname) {
  TextLineReader reader;

  if (PASSED(reader.Open(fname))) {
    return InitFromFile(&reader, fname);
  } else {
    matrix_.Init(0, 0);
    info_.Init();
    NONFATAL("Could not open file '%s' for reading.", fname);
    return SUCCESS_FAIL;
  }
}

success_t Dataset::InitFromFile(TextLineReader *reader,
    const char *filename) {
  success_t result;

  result = info_.InitFromFile(reader, filename);
  if (PASSED(result)) {
    result = info_.ReadMatrix(reader, &matrix_);
  } else {
    matrix_.Init(0, 0);
  }

  return result;
}


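// Writes the dataset as comma/tab separated values, optionally preceded by a
// header row of feature names.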
success_t Dataset::WriteCsv(const char *fname, bool header) const {
  TextWriter writer;

  if (!PASSED(writer.Open(fname))) {
    NONFATAL("Couldn't open '%s' for writing.", fname);
    return SUCCESS_FAIL;
  } else {
    if (header) {
      info_.WriteCsvHeader(",\t", &writer);
    }
    info_.WriteMatrix(matrix_, ",\t", &writer);
    return writer.Close();
  }
}

success_t Dataset::WriteArff(const char *fname) const {
  TextWriter writer;

  if (!PASSED(writer.Open(fname))) {
    NONFATAL("Couldn't open '%s' for writing.", fname);
    return SUCCESS_FAIL;
  } else {
    info_.WriteArffHeader(&writer);
    info_.WriteMatrix(matrix_, ",", &writer);
    return writer.Close();
  }
}

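// Splits the points, in the order given by permutation, into training and
// test sets for cross-validation; point i goes to the test set when
// (i - fold_number) is a multiple of folds.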
void Dataset::SplitTrainTest(int folds, int fold_number,
    const ArrayList<index_t>& permutation,
    Dataset *train, Dataset *test) const {
  index_t n_test = (n_points() + folds - fold_number - 1) / folds;
  index_t n_train = n_points() - n_test;

  train->InitBlank();
  train->info().InitCopy(info());

  test->InitBlank();
  test->info().InitCopy(info());

  train->matrix().Init(n_features(), n_train);
  test->matrix().Init(n_features(), n_test);

  index_t i_train = 0;
  index_t i_test = 0;
  index_t i_orig = 0;

  for (i_orig = 0; i_orig < n_points(); i_orig++) {
    double *dest;

    if (unlikely((i_orig - fold_number) % folds == 0)) {
      dest = test->matrix().GetColumnPtr(i_test);
      i_test++;
    } else {
      dest = train->matrix().GetColumnPtr(i_train);
      i_train++;
    }

    mem::Copy(dest,
        this->matrix().GetColumnPtr(permutation[i_orig]),
        n_features());
  }

  DEBUG_ASSERT(i_train == train->n_points());
  DEBUG_ASSERT(i_test == test->n_points());
}

success_t data::Load(const char *fname, Matrix *matrix) {
  Dataset dataset;
  success_t result = dataset.InitFromFile(fname);
  matrix->Own(&dataset.matrix());
  return result;
}

success_t data::Save(const char *fname, const Matrix& matrix) {
  Dataset dataset;
  dataset.AliasMatrix(matrix);
  return dataset.WriteCsv(fname);
}
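
// A minimal usage sketch of the helpers above (the file names are
// hypothetical):
//
//   Matrix m;
//   if (PASSED(data::Load("points.csv", &m))) {
//     // m holds one feature per row and one point per column
//     data::Save("points_out.csv", m);
//   }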
00683 
Generated on Mon Jan 24 12:04:37 2011 for FASTlib by  doxygen 1.6.3