dataset_preprocess.cc

00001 /* MLPACK 0.2
00002  *
00003  * Copyright (c) 2008, 2009 Alexander Gray,
00004  *                          Garry Boyer,
00005  *                          Ryan Riegel,
00006  *                          Nikolaos Vasiloglou,
00007  *                          Dongryeol Lee,
00008  *                          Chip Mappus, 
00009  *                          Nishant Mehta,
00010  *                          Hua Ouyang,
00011  *                          Parikshit Ram,
00012  *                          Long Tran,
00013  *                          Wee Chin Wong
00014  *
00015  * Copyright (c) 2008, 2009 Georgia Institute of Technology
00016  *
00017  * This program is free software; you can redistribute it and/or
00018  * modify it under the terms of the GNU General Public License as
00019  * published by the Free Software Foundation; either version 2 of the
00020  * License, or (at your option) any later version.
00021  *
00022  * This program is distributed in the hope that it will be useful, but
00023  * WITHOUT ANY WARRANTY; without even the implied warranty of
00024  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00025  * General Public License for more details.
00026  *
00027  * You should have received a copy of the GNU General Public License
00028  * along with this program; if not, write to the Free Software
00029  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
00030  * 02110-1301, USA.
00031  */
00032 #include "fastlib/fastlib.h"
00033 
00034 void FindIndexWithPrefix(Dataset &dataset, char *prefix,
00035                          ArrayList<int> &remove_indices, 
00036                          bool keep_going_after_first_match) {
00037 
00038   // Get the dataset information containing the feature types and
00039   // names.
00040   DatasetInfo &info = dataset.info();
00041   ArrayList<DatasetFeature> &features = info.features();
00042 
00043   for(index_t i = 0; i < features.size(); i++) {
00044 
00045     // If a feature name with the desired prefix has been found, then
00046     // make sure it hasn't been selected before. If so, then add to
00047     // the remove indices.
00048     const String &feature_name = features[i].name();
00049 
00050     if(!strncmp(prefix, feature_name.c_str(), strlen(prefix) - 1)) {
00051       
00052       bool does_not_exist_yet = true;
00053       for(index_t j = 0; j < remove_indices.size(); j++) {
00054         if(remove_indices[j] == i) {
00055           does_not_exist_yet = false;
00056           break;
00057         }
00058       }
00059       if(does_not_exist_yet) {
00060         printf("Found: %s at position %d.\n", feature_name.c_str(), i);
00061         remove_indices.PushBackCopy(i);
00062         
00063         if(!keep_going_after_first_match) {
00064           break;
00065         }
00066       }
00067     }    
00068   }  
00069 }
00070 
00071 int main(int argc, char *argv[]) {
00072   fx_init(argc, argv, NULL);
00073 
00074   // Read in the dataset from the file.
00075   Dataset initial_dataset;
00076   const char *dataset_name = fx_param_str_req(fx_root, "data");
00077   if(initial_dataset.InitFromFile(dataset_name) != SUCCESS_PASS) {
00078     FATAL("Could ont read the dataset %s", dataset_name);
00079   }
00080 
00081   // Now examine each feature name of the dataset, and construct the
00082   // indices.
00083   ArrayList<int> remove_indices;
00084   remove_indices.Init();
00085   char buffer[1000];
00086   do {
00087     printf("Input the prefix of the feature that you want to remove ");
00088     printf("(just press enter if you are done): ");
00089     fgets(buffer, 998, stdin);
00090 
00091     if(strlen(buffer) == 1) {
00092       break;
00093     }
00094     FindIndexWithPrefix(initial_dataset, buffer, remove_indices, false);
00095   } while(true);
00096 
00097   ArrayList<int> prune_indices;
00098   prune_indices.Init();
00099   do {
00100     printf("Input the prefix of the feature that you want to consider for pruning ");
00101     printf("(just press enter if you are done): ");
00102     fgets(buffer, 998, stdin);
00103     
00104     if(strlen(buffer) == 1) {
00105       break;
00106     }
00107     FindIndexWithPrefix(initial_dataset, buffer, prune_indices, true);
00108   } while(true);
00109 
00110   // Output the indices to the file based on the results above.
00111   FILE *predictor_file = fopen("predictor_indices.csv", "w+");
00112   FILE *prune_file = fopen("prune_indices.csv", "w+");
00113 
00114   for(index_t i = 0; i < initial_dataset.matrix().n_rows(); i++) {
00115     bool to_be_removed = false;
00116     for(index_t j = 0; j < remove_indices.size(); j++) {
00117       if(remove_indices[j] == i) {
00118         to_be_removed = true;
00119         break;
00120       }
00121     }
00122     if(!to_be_removed) {
00123       fprintf(predictor_file, "%d\n", i);
00124     }
00125   }
00126   for(index_t i = 0; i < prune_indices.size(); i++) {
00127     fprintf(prune_file, "%d\n", prune_indices[i]);
00128   }
00129   fclose(predictor_file);
00130   fclose(prune_file);
00131 
00132   fx_done(fx_root);
00133   return 0;
00134 }
Generated on Mon Jan 24 12:04:38 2011 for FASTlib by  doxygen 1.6.3