dataset_preprocess.cc
00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032 #include "fastlib/fastlib.h"
00033
00034 void FindIndexWithPrefix(Dataset &dataset, char *prefix,
00035 ArrayList<int> &remove_indices,
00036 bool keep_going_after_first_match) {
00037
00038
00039
00040 DatasetInfo &info = dataset.info();
00041 ArrayList<DatasetFeature> &features = info.features();
00042
00043 for(index_t i = 0; i < features.size(); i++) {
00044
00045
00046
00047
00048 const String &feature_name = features[i].name();
00049
00050 if(!strncmp(prefix, feature_name.c_str(), strlen(prefix) - 1)) {
00051
00052 bool does_not_exist_yet = true;
00053 for(index_t j = 0; j < remove_indices.size(); j++) {
00054 if(remove_indices[j] == i) {
00055 does_not_exist_yet = false;
00056 break;
00057 }
00058 }
00059 if(does_not_exist_yet) {
00060 printf("Found: %s at position %d.\n", feature_name.c_str(), i);
00061 remove_indices.PushBackCopy(i);
00062
00063 if(!keep_going_after_first_match) {
00064 break;
00065 }
00066 }
00067 }
00068 }
00069 }
00070
00071 int main(int argc, char *argv[]) {
00072 fx_init(argc, argv, NULL);
00073
00074
00075 Dataset initial_dataset;
00076 const char *dataset_name = fx_param_str_req(fx_root, "data");
00077 if(initial_dataset.InitFromFile(dataset_name) != SUCCESS_PASS) {
00078 FATAL("Could ont read the dataset %s", dataset_name);
00079 }
00080
00081
00082
00083 ArrayList<int> remove_indices;
00084 remove_indices.Init();
00085 char buffer[1000];
00086 do {
00087 printf("Input the prefix of the feature that you want to remove ");
00088 printf("(just press enter if you are done): ");
00089 fgets(buffer, 998, stdin);
00090
00091 if(strlen(buffer) == 1) {
00092 break;
00093 }
00094 FindIndexWithPrefix(initial_dataset, buffer, remove_indices, false);
00095 } while(true);
00096
00097 ArrayList<int> prune_indices;
00098 prune_indices.Init();
00099 do {
00100 printf("Input the prefix of the feature that you want to consider for pruning ");
00101 printf("(just press enter if you are done): ");
00102 fgets(buffer, 998, stdin);
00103
00104 if(strlen(buffer) == 1) {
00105 break;
00106 }
00107 FindIndexWithPrefix(initial_dataset, buffer, prune_indices, true);
00108 } while(true);
00109
00110
00111 FILE *predictor_file = fopen("predictor_indices.csv", "w+");
00112 FILE *prune_file = fopen("prune_indices.csv", "w+");
00113
00114 for(index_t i = 0; i < initial_dataset.matrix().n_rows(); i++) {
00115 bool to_be_removed = false;
00116 for(index_t j = 0; j < remove_indices.size(); j++) {
00117 if(remove_indices[j] == i) {
00118 to_be_removed = true;
00119 break;
00120 }
00121 }
00122 if(!to_be_removed) {
00123 fprintf(predictor_file, "%d\n", i);
00124 }
00125 }
00126 for(index_t i = 0; i < prune_indices.size(); i++) {
00127 fprintf(prune_file, "%d\n", prune_indices[i]);
00128 }
00129 fclose(predictor_file);
00130 fclose(prune_file);
00131
00132 fx_done(fx_root);
00133 return 0;
00134 }