00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00043 #include "fastlib/base/base.h"
00044
00045 #include "fastlib/data/dataset.h"
00046
00047
00048
00049 void DatasetFeature::Format(double value, String *result) const {
00050 if (unlikely(isnan(value))) {
00051 result->Copy("?");
00052 return;
00053 }
00054 switch (type_) {
00055 case CONTINUOUS:
00056 if (floor(value) != value) {
00057
00058 result->InitSprintf("%1.17e", value);
00059 } else {
00060
00061 result->InitSprintf("%.17g", value);
00062 }
00063 break;
00064 case INTEGER: result->InitSprintf("%lu", long(value)); break;
00065 case NOMINAL: result->InitCopy(value_name(int(value))); break;
00066 #ifdef DEBUG
00067 default: abort();
00068 #endif
00069 }
00070 }
00071
00072 success_t DatasetFeature::Parse(const char *str, double *d) const {
00073 if (unlikely(str[0] == '?') && unlikely(str[1] == '\0')) {
00074 *d = DBL_NAN;
00075 return SUCCESS_PASS;
00076 }
00077 switch (type_) {
00078 case CONTINUOUS: {
00079 char *end;
00080 *d = strtod(str, &end);
00081 if (likely(*end == '\0')) {
00082 return SUCCESS_PASS;
00083 } else {
00084 return SUCCESS_FAIL;
00085 }
00086 }
00087 case INTEGER: {
00088 int i;
00089 if (sscanf(str, "%d", &i) == 1) {
00090 *d = i;
00091 return SUCCESS_PASS;
00092 } else {
00093 return SUCCESS_FAIL;
00094 }
00095 }
00096 case NOMINAL: {
00097 index_t i;
00098 for (i = 0; i < value_names_.size(); i++) {
00099 if (value_names_[i] == str) {
00100 *d = i;
00101 return SUCCESS_PASS;
00102 }
00103 }
00104 *d = DBL_NAN;
00105 return SUCCESS_FAIL;
00106 }
00107 default: abort();
00108 }
00109 }
00110
00111
00112
00113
00114 void DatasetInfo::InitContinuous(index_t n_features,
00115 const char *name_in) {
00116 features_.Init(n_features);
00117
00118 name_.Copy(name_in);
00119
00120 for (index_t i = 0; i < n_features; i++) {
00121 String feature_name;
00122 feature_name.InitSprintf("feature_%d", int(i));
00123 features_[i].InitContinuous(feature_name);
00124 }
00125 }
00126
00127 void DatasetInfo::Init(const char *name_in) {
00128 features_.Init();
00129 name_.Copy(name_in);
00130 }
00131
00132 char *DatasetInfo::SkipSpace_(char *s) {
00133 while (isspace(*s)) {
00134 s++;
00135 }
00136
00137 if (unlikely(*s == '%') || unlikely(*s == '\0')) {
00138 return s + strlen(s);
00139 }
00140
00141 return s;
00142 }
00143
00144 char *DatasetInfo::SkipNonspace_(char *s) {
00145 while (likely(*s != '\0')
00146 && likely(*s != '%')
00147 && likely(*s != ' ')
00148 && likely(*s != '\t')) {
00149 s++;
00150 }
00151
00152 return s;
00153 }
00154
00155 void DatasetInfo::SkipBlanks_(TextLineReader *reader) {
00156 while (reader->MoreLines() && *SkipSpace_(reader->Peek().begin()) == '\0') {
00157 reader->Gobble();
00158 }
00159 }
00160
00161 success_t DatasetInfo::InitFromArff(TextLineReader *reader,
00162 const char *filename) {
00163 success_t result = SUCCESS_PASS;
00164
00165 Init(filename);
00166
00167 while (1) {
00168 SkipBlanks_(reader);
00169
00170 String *peeked = &reader->Peek();
00171 ArrayList<String> portions;
00172
00173 portions.Init();
00174 peeked->Split(0, " \t", "%", 3, &portions);
00175
00176 if (portions.size() == 0) {
00177
00178 } else if (portions[0][0] != '@') {
00179 reader->Error("ARFF: Unexpected @command. Did you forget @data?");
00180 result = SUCCESS_FAIL;
00181 break;
00182 } else {
00183 if (portions[0].EqualsNoCase("@relation")) {
00184 if (portions.size() < 2) {
00185 reader->Error("ARFF: @relation requires name");
00186 result = SUCCESS_FAIL;
00187 } else {
00188 set_name(portions[1]);
00189 }
00190 } else if (portions[0].EqualsNoCase("@attribute")) {
00191 if (portions.size() < 3) {
00192 reader->Error("ARFF: @attribute requires name and type.");
00193 result = SUCCESS_FAIL;
00194 } else {
00195 if (portions[2][0] == '{') {
00196 DatasetFeature *feature = &features_.PushBack();
00197
00198 feature->InitNominal(portions[1]);
00199
00200 portions[2].Split(1, ", \t", "}%", 0, &feature->value_names());
00201 } else {
00202 String type(portions[2]);
00203
00204 if (type.EqualsNoCase("numeric")
00205 || type.EqualsNoCase("real")) {
00206 features_.PushBack().InitContinuous(portions[1]);
00207 } else if (type.EqualsNoCase("integer")) {
00208 features_.PushBack().InitInteger(portions[1]);
00209 } else {
00210 reader->Error(
00211 "ARFF: Only support 'numeric', 'real', and {nominal}.");
00212 result = SUCCESS_FAIL;
00213 }
00214 }
00215 }
00216 } else if (portions[0].EqualsNoCase("@data")) {
00217
00218 reader->Gobble();
00219 break;
00220 } else {
00221 reader->Error("ARFF: Expected @relation, @attribute, or @data.");
00222 result = SUCCESS_FAIL;
00223 break;
00224 }
00225 }
00226
00227 reader->Gobble();
00228 }
00229
00230 return result;
00231 }
00232
00233 success_t DatasetInfo::InitFromCsv(TextLineReader *reader,
00234 const char *filename) {
00235 ArrayList<String> headers;
00236 bool nonnumeric = false;
00237
00238 Init(filename);
00239
00240 headers.Init();
00241 reader->Peek().Split(", \t", &headers);
00242
00243 if (headers.size() == 0) {
00244 reader->Error("Trying to parse empty file as CSV.");
00245 return SUCCESS_FAIL;
00246 }
00247
00248
00249 for (index_t i = 0; i < headers.size(); i++) {
00250 char *end;
00251
00252 (void) strtod(headers[i], &end);
00253
00254 if (end != headers[i].end()) {
00255 nonnumeric = true;
00256 break;
00257 }
00258 }
00259
00260 if (nonnumeric) {
00261 for (index_t i = 0; i < headers.size(); i++) {
00262 features_.PushBack().InitContinuous(headers[i]);
00263 }
00264 reader->Gobble();
00265 } else {
00266 for (index_t i = 0; i < headers.size(); i++) {
00267 String name;
00268 #ifndef LI
00269 #define LI ""
00270 #endif
00271 name.InitSprintf("feature%"LI"d", i);
00272 features_.PushBack().InitContinuous(name);
00273 }
00274 }
00275
00276 return SUCCESS_PASS;
00277 }
00278
// Auto-detects the file format from the first non-blank line:
// a line starting with '@' is parsed as ARFF, anything else as CSV.
success_t DatasetInfo::InitFromFile(TextLineReader *reader,
                                    const char *filename) {
  SkipBlanks_(reader);

  char *first_line = SkipSpace_(reader->Peek().begin());

  // NOTE(review): SkipSpace_ never returns NULL (it returns a pointer into
  // the line), so this branch looks unreachable — confirm whether it guards
  // a historical contract before removing it.
  if (!first_line) {
    Init();
    reader->Error("Could not parse the first line.");
    return SUCCESS_FAIL;
  } else if (*first_line == '@') {
    // ARFF header detected.
    return InitFromArff(reader, filename);
  } else {
    // Default: CSV-style / whitespace-separated data.
    return InitFromCsv(reader, filename);
  }
}
00297
00298 index_t Dataset::n_labels() const {
00299 index_t i = 0;
00300 index_t label_row_idx = matrix_.n_rows() - 1;
00301 index_t n_labels = 0;
00302
00303 double current_label;
00304
00305 ArrayList<double> labels_list;
00306 labels_list.Init();
00307 labels_list.PushBack() = matrix_.get(label_row_idx,0);
00308 n_labels++;
00309
00310 for (i = 1; i < matrix_.n_cols(); i++) {
00311 current_label = matrix_.get(label_row_idx,i);
00312 index_t j = 0;
00313 for (j = 0; j < n_labels; j++) {
00314 if (current_label == labels_list[j]) {
00315 break;
00316 }
00317 }
00318 if (j == n_labels) {
00319 labels_list.PushBack() = current_label;
00320 n_labels++;
00321 }
00322 }
00323 labels_list.Clear();
00324 return n_labels;
00325 }
00326
// Extracts the class labels stored in the LAST row of the matrix and groups
// point indices by label.  All output arrays are reset (Renew) first.
//
//   labels_list     - each distinct label value, in order of first appearance
//   labels_index    - point indices permuted so same-label points are
//                     contiguous, label runs in labels_list order
//   labels_ct       - number of points carrying each label
//   labels_startpos - offset in labels_index where each label's run begins
void Dataset::GetLabels(ArrayList<double> &labels_list,
                        ArrayList<index_t> &labels_index,
                        ArrayList<index_t> &labels_ct,
                        ArrayList<index_t> &labels_startpos) const {
  index_t i = 0;
  index_t label_row_idx = matrix_.n_rows() - 1;  // labels are the last row
  index_t n_points = matrix_.n_cols();
  index_t n_labels = 0;

  double current_label;

  // Discard any previous contents of the output arrays.
  labels_list.Renew();
  labels_index.Renew();
  labels_ct.Renew();
  labels_startpos.Renew();

  labels_index.Init(n_points);
  labels_list.Init();
  labels_ct.Init();
  labels_startpos.Init();

  // labels_temp[i] = index (into labels_list) of point i's label.
  ArrayList<index_t> labels_temp;
  labels_temp.Init(n_points);
  labels_temp[0] = 0;

  // Seed with the first point's label (assumes n_points >= 1).
  labels_list.PushBack() = matrix_.get(label_row_idx,0);
  labels_ct.PushBack() = 1;
  n_labels++;

  // Pass 1: assign each point a label index, counting occurrences.
  for (i = 1; i < n_points; i++) {
    current_label = matrix_.get(label_row_idx, i);
    index_t j = 0;
    for (j = 0; j < n_labels; j++) {
      if (current_label == labels_list[j]) {
        labels_ct[j]++;
        break;
      }
    }
    // If the inner loop ran to completion, j == n_labels: a new label.
    labels_temp[i] = j;
    if (j == n_labels) {
      labels_list.PushBack() = current_label;
      labels_ct.PushBack() = 1;
      n_labels++;
    }
  }

  // Prefix-sum the counts to get each label's starting offset.
  labels_startpos.PushBack() = 0;
  for(i = 1; i < n_labels; i++){
    labels_startpos.PushBack() = labels_startpos[i-1] + labels_ct[i-1];
  }

  // Pass 2: bucket point indices by label.  This advances each start
  // position as its bucket fills (counting-sort style), so startpos is
  // recomputed immediately below.
  for(i = 0; i < n_points; i++) {
    labels_index[labels_startpos[labels_temp[i]]] = i;
    labels_startpos[labels_temp[i]]++;
  }

  // Restore the start positions that pass 2 consumed.
  labels_startpos[0] = 0;
  for(i = 1; i < n_labels; i++)
    labels_startpos[i] = labels_startpos[i-1] + labels_ct[i-1];

  labels_temp.Clear();
}
00390
00391 bool DatasetInfo::is_all_continuous() const {
00392 for (index_t i = 0; i < features_.size(); i++) {
00393 if (features_[i].type() != DatasetFeature::CONTINUOUS) {
00394 return false;
00395 }
00396 }
00397
00398 return true;
00399 }
00400
// Reads all remaining data rows from `reader` into `matrix`
// (one column per point, one row per feature).
success_t DatasetInfo::ReadMatrix(TextLineReader *reader, Matrix *matrix) const {
  ArrayList<double> linearized;   // staging buffer, one point after another
  index_t n_features = this->n_features();
  index_t n_points = 0;
  success_t retval = SUCCESS_PASS;
  bool is_done;

  linearized.Init();

  // Space for a point is reserved BEFORE we know whether one exists, so
  // when ReadPoint reports EOF one extra unfilled point has been pushed.
  do {
    double *point = linearized.PushBackRaw(n_features);
    retval = ReadPoint(reader, point, &is_done);
    n_points++;
  } while (!is_done && !FAILED(retval));

  if (!FAILED(retval)) {
    DEBUG_ASSERT(linearized.size() == n_features * n_points);
    DEBUG_ASSERT(linearized.size() >= n_features);
    DEBUG_ASSERT(linearized.size() % n_features == 0);
    // Drop the speculative final point left unfilled at EOF.
    n_points--;
    linearized.Resize(n_features * n_points);
  }
  // NOTE(review): on failure n_points is NOT corrected, so the matrix
  // dimensions below include the unfilled point — callers are expected to
  // check the return value before using `matrix`; confirm.

  linearized.Trim();

  // Hand the staging buffer to the matrix without copying.
  matrix->Own(linearized.ReleasePtr(), n_features, n_points);

  return retval;
}
00430
// Reads one data point (one line) from `reader` into `point`, parsing each
// field with the corresponding feature's Parse().  Sets *is_done (and
// returns SUCCESS_PASS) when no lines remain.  NOTE: tokenization writes
// NUL bytes into the reader's line buffer in place; before reporting an
// error the NULs are turned back into commas so the message shows the line.
success_t DatasetInfo::ReadPoint(TextLineReader *reader, double *point,
                                 bool *is_done) const {
  index_t n_features = this->n_features();
  char *pos;

  *is_done = false;

  // Skip blank and comment-only lines until a data line (or EOF) is found.
  for (;;) {
    if (!reader->MoreLines()) {
      *is_done = true;
      return SUCCESS_PASS;
    }

    pos = reader->Peek().begin();

    while (*pos == ' ' || *pos == '\t' || *pos == ',') {
      pos++;
    }

    if (unlikely(*pos == '\0' || *pos == '%')) {
      reader->Gobble();
    } else {
      break;
    }
  }

  for (index_t i = 0; i < n_features; i++) {
    char *next;

    // Skip separators before the next token.
    while (*pos == ' ' || *pos == '\t' || *pos == ',') {
      pos++;
    }

    if (unlikely(*pos == '\0')) {
      // Line ended early: restore the NULs we punched into the buffer so
      // the error message can display the original line.
      for (char *s = reader->Peek().begin(); s < pos; s++) {
        if (!*s) {
          *s = ',';
        }
      }
      reader->Error("I am expecting %"LI"d entries per row, "
                    "but this line has only %"LI"d.",
                    n_features, i);
      return SUCCESS_FAIL;
    }

    // Find the end of the current token.
    next = pos;
    while (*next != '\0' && *next != ' ' && *next != '\t' && *next != ','
           && *next != '%') {
      next++;
    }

    if (*next != '\0') {
      // Terminate the token in place.  A '%' is left un-skipped so the
      // comment check below (via *pos after the loop) still sees it.
      char c = *next;
      *next = '\0';
      if (c != '%') {
        next++;
      }
    }

    if (!PASSED(features_[i].Parse(pos, &point[i]))) {
      // Copy the offending token before repairing the buffer for display.
      char *end = reader->Peek().end();
      String tmp;
      tmp.Copy(pos);
      for (char *s = reader->Peek().begin(); s < next && s < end; s++) {
        if (*s == '\0') {
          *s = ',';
        }
      }
      reader->Error("Invalid parse: [%s]", tmp.c_str());
      return SUCCESS_FAIL;
    }

    pos = next;
  }

  // Only separators may remain after the last expected field.
  while (*pos == ' ' || *pos == '\t' || *pos == ',') {
    pos++;
  }

  if (*pos != '\0') {
    // Restore NULs for display, then report the trailing content.
    for (char *s = reader->Peek().begin(); s < pos; s++) {
      if (*s == '\0') {
        *s = ',';
      }
    }
    reader->Error("Extra junk on line.");
    return SUCCESS_FAIL;
  }

  reader->Gobble();

  return SUCCESS_PASS;
}
00524
00525
00526 void DatasetInfo::WriteArffHeader(TextWriter *writer) const {
00527 writer->Printf("@relation %s\n", name_.c_str());
00528
00529 for (index_t i = 0; i < features_.size(); i++) {
00530 const DatasetFeature *feature = &features_[i];
00531 writer->Printf("@attribute %s ", feature->name().c_str());
00532 if (feature->type() == DatasetFeature::NOMINAL) {
00533 writer->Printf("{");
00534 for (index_t v = 0; v < feature->n_values(); v++) {
00535 if (v != 0) {
00536 writer->Write(",");
00537 }
00538 writer->Write(feature->value_name(v).c_str());
00539 }
00540 writer->Printf("}");
00541 } else {
00542 writer->Write("real");
00543 }
00544 writer->Write("\n");
00545 }
00546 writer->Printf("@data\n");
00547 }
00548
00549 void DatasetInfo::WriteCsvHeader(const char *sep, TextWriter *writer) const {
00550 for (index_t i = 0; i < features_.size(); i++) {
00551 if (i != 0) {
00552 writer->Write(sep);
00553 }
00554 writer->Write(features_[i].name().c_str());
00555 }
00556 writer->Write("\n");
00557 }
00558
00559 void DatasetInfo::WriteMatrix(const Matrix& matrix, const char *sep,
00560 TextWriter *writer) const {
00561 for (index_t i = 0; i < matrix.n_cols(); i++) {
00562 for (index_t f = 0; f < features_.size(); f++) {
00563 if (f != 0) {
00564 writer->Write(sep);
00565 }
00566 String str;
00567 features_[f].Format(matrix.get(f, i), &str);
00568 writer->Write(str);
00569 }
00570 writer->Write("\n");
00571 }
00572 }
00573
00574
00575
00576 success_t Dataset::InitFromFile(const char *fname) {
00577 TextLineReader reader;
00578
00579 if (PASSED(reader.Open(fname))) {
00580 return InitFromFile(&reader, fname);
00581 } else {
00582 matrix_.Init(0, 0);
00583 info_.Init();
00584 NONFATAL("Could not open file '%s' for reading.", fname);
00585 return SUCCESS_FAIL;
00586 }
00587 }
00588
00589 success_t Dataset::InitFromFile(TextLineReader *reader,
00590 const char *filename) {
00591 success_t result;
00592
00593 result = info_.InitFromFile(reader, filename);
00594 if (PASSED(result)) {
00595 result = info_.ReadMatrix(reader, &matrix_);
00596 } else {
00597 matrix_.Init(0, 0);
00598 }
00599
00600 return result;
00601 }
00602
00603
00604 success_t Dataset::WriteCsv(const char *fname, bool header) const {
00605 TextWriter writer;
00606
00607 if (!PASSED(writer.Open(fname))) {
00608 NONFATAL("Couldn't open '%s' for writing.", fname);
00609 return SUCCESS_FAIL;
00610 } else {
00611 if (header) {
00612 info_.WriteCsvHeader(",\t", &writer);
00613 }
00614 info_.WriteMatrix(matrix_, ",\t", &writer);
00615 return writer.Close();
00616 }
00617 }
00618
00619 success_t Dataset::WriteArff(const char *fname) const {
00620 TextWriter writer;
00621
00622 if (!PASSED(writer.Open(fname))) {
00623 NONFATAL("Couldn't open '%s' for writing.", fname);
00624 return SUCCESS_FAIL;
00625 } else {
00626 info_.WriteArffHeader(&writer);
00627 info_.WriteMatrix(matrix_, ",", &writer);
00628 return writer.Close();
00629 }
00630 }
00631
// Splits the dataset into train/test subsets for cross-validation fold
// `fold_number` out of `folds`, visiting points in the order given by
// `permutation`.  Point i goes to the TEST set when
// (i - fold_number) % folds == 0; everything else goes to train.
// Both output datasets get a copy of this dataset's feature info.
void Dataset::SplitTrainTest(int folds, int fold_number,
                             const ArrayList<index_t>& permutation,
                             Dataset *train, Dataset *test) const {
  // Ceiling division: how many of the n_points land in this test fold.
  index_t n_test = (n_points() + folds - fold_number - 1) / folds;
  index_t n_train = n_points() - n_test;

  train->InitBlank();
  train->info().InitCopy(info());

  test->InitBlank();
  test->info().InitCopy(info());

  train->matrix().Init(n_features(), n_train);
  test->matrix().Init(n_features(), n_test);

  index_t i_train = 0;
  index_t i_test = 0;
  index_t i_orig = 0;

  for (i_orig = 0; i_orig < n_points(); i_orig++) {
    double *dest;

    // NOTE(review): if index_t is unsigned, (i_orig - fold_number) wraps
    // for i_orig < fold_number and the modulo result is wrong — confirm
    // index_t is signed in this build.
    if (unlikely((i_orig - fold_number) % folds == 0)) {
      dest = test->matrix().GetColumnPtr(i_test);
      i_test++;
    } else {
      dest = train->matrix().GetColumnPtr(i_train);
      i_train++;
    }

    // Copy the permuted source column (n_features doubles) into place.
    mem::Copy(dest,
              this->matrix().GetColumnPtr(permutation[i_orig]),
              n_features());
  }

  DEBUG_ASSERT(i_train == train->n_points());
  DEBUG_ASSERT(i_test == test->n_points());
}
00670
00671 success_t data::Load(const char *fname, Matrix *matrix) {
00672 Dataset dataset;
00673 success_t result = dataset.InitFromFile(fname);
00674 matrix->Own(&dataset.matrix());
00675 return result;
00676 }
00677
00678 success_t data::Save(const char *fname, const Matrix& matrix) {
00679 Dataset dataset;
00680 dataset.AliasMatrix(matrix);
00681 return dataset.WriteCsv(fname);
00682 }
00683