#include "svm.h"
#include "fastlib/math/statistics.h"
#include <stdio.h>   /* fprintf */
#include <stdlib.h>  /* rand, srand */
#include <time.h>    /* time, used to seed the RNG */
#include <math.h>    /* sqrt */

const fx_entry_doc svm_main_entries_doc[] = {
  {"learner_name", FX_REQUIRED, FX_STR, NULL,
   " The name of the support vector learner; values: \"svm_c\" for classification, \"svm_r\" for regression, \"svm_de\" for one-class SVM.\n"},
  {"mode", FX_REQUIRED, FX_STR, NULL,
   " The mode of svm_main; values: \"cv\", \"train\", \"train_test\", \"test\".\n"},
  {"k_cv", FX_PARAM, FX_INT, NULL,
   " The number of folds for cross validation; only required under \"cv\" mode.\n"},
  {"cv_data", FX_PARAM, FX_STR, NULL,
   " The file name for cross validation data; only required under \"cv\" mode.\n"},
  {"train_data", FX_PARAM, FX_STR, NULL,
   " The file name for training data; only required under \"train\" or \"train_test\" mode.\n"},
  {"test_data", FX_PARAM, FX_STR, NULL,
   " The file name for testing data; only required under \"test\" or \"train_test\" mode.\n"},
  {"kernel", FX_REQUIRED, FX_STR, NULL,
   " Kernel name; values: \"linear\", \"gaussian\".\n"},
  {"sigma", FX_PARAM, FX_DOUBLE, NULL,
   " (for Gaussian kernel) sigma in the Gaussian kernel k(x1,x2)=exp(-||x1-x2||^2/(2*sigma^2)); only required when using the \"gaussian\" kernel.\n"},
  {"c", FX_PARAM, FX_DOUBLE, NULL,
   " (for SVM_C) the cost parameter C (>0) that controls the compromise between large margins and small margin violations. Default value: 10.0.\n"},
  {"c_p", FX_PARAM, FX_DOUBLE, NULL,
   " (for SVM_C) the cost parameter for the positive class (y==1). Default value: c.\n"},
  {"c_n", FX_PARAM, FX_DOUBLE, NULL,
   " (for SVM_C) the cost parameter for the negative class (y==-1). Default value: c.\n"},
  {"epsilon", FX_PARAM, FX_DOUBLE, NULL,
   " (for SVM_R) the epsilon of the epsilon-insensitive loss in SVM regression. Default value: 0.1.\n"},
  {"wss", FX_PARAM, FX_INT, NULL,
   " Working set selection scheme: 1 for 1st-order expansion, 2 for 2nd-order expansion. Default value: 1.\n"},
  {"normalize", FX_PARAM, FX_BOOL, NULL,
   " Whether to normalize the data before training/testing; values: \"0\" for no normalization, \"1\" for normalization.\n"},
  FX_ENTRY_DOC_DONE
};

const fx_module_doc svm_main_doc = {
  svm_main_entries_doc, NULL,
  "Implementations of Support Vector Machines (SVM), including multiclass classification, regression, and one-class SVM.\n"
};

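/*
 * Example invocations (a sketch: the binary name "svm_main" and the CSV
 * file names are assumptions, and the --key=value syntax assumes
 * fastlib's usual fx argument parsing; only the parameter names come
 * from the table above):
 *
 *   ./svm_main --mode=train_test --learner_name=svm_c --kernel=linear \
 *              --train_data=train.csv --test_data=test.csv --normalize=1
 *
 *   ./svm_main --mode=cv --learner_name=svm_r --kernel=gaussian \
 *              --sigma=0.5 --k_cv=10 --cv_data=data.csv
 */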
/**
 * Whiten the feature part of the dataset (the last row holds labels):
 * subtract the per-feature mean, then multiply by the inverse square
 * root of the sample covariance.
 */
void DoSvmNormalize(Dataset* dataset) {
  Matrix m;
  Vector sums;

  /* Copy the features (all rows but the label row) and accumulate
   * per-feature sums. */
  m.Init(dataset->n_features() - 1, dataset->n_points());
  sums.Init(dataset->n_features() - 1);
  sums.SetZero();

  for (index_t i = 0; i < dataset->n_points(); i++) {
    Vector s;
    Vector d;
    dataset->matrix().MakeColumnSubvector(i, 0, dataset->n_features() - 1, &s);
    m.MakeColumnVector(i, &d);
    d.CopyValues(s);
    la::AddTo(s, &sums);
  }

  /* Center every point; after scaling, sums holds -mean. */
  la::Scale(-1.0 / dataset->n_points(), &sums);
  for (index_t i = 0; i < dataset->n_points(); i++) {
    Vector d;
    m.MakeColumnVector(i, &d);
    la::AddTo(sums, &d);
  }

  /* Scatter matrix of the centered data: cov = m * m^T. */
  Matrix cov;
  la::MulTransBInit(m, m, &cov);

  Vector d;
  Matrix u;
  Matrix ui;

  /* Eigendecomposition: cov = u * diag(d) * u^T. */
  PASSED(la::EigenvectorsInit(cov, &d, &u));
  la::TransposeInit(u, &ui);

  /* Convert scatter eigenvalues to inverse square roots of covariance
   * eigenvalues.  (Near-zero eigenvalues would blow up here.) */
  for (index_t i = 0; i < d.length(); i++) {
    d[i] = 1.0 / sqrt(d[i] / (dataset->n_points() - 1));
  }

  /* ui = diag(d) * u^T, so cov_inv_half = u * diag(d) * u^T
   * = covariance^(-1/2). */
  la::ScaleRows(d, &ui);

  Matrix cov_inv_half;
  la::MulInit(u, ui, &cov_inv_half);

  /* Whiten the centered data and copy it back into the dataset. */
  Matrix final;
  la::MulInit(cov_inv_half, m, &final);

  for (index_t i = 0; i < dataset->n_points(); i++) {
    Vector s;
    Vector d;
    dataset->matrix().MakeColumnSubvector(i, 0, dataset->n_features() - 1, &d);
    final.MakeColumnVector(i, &s);
    d.CopyValues(s);
  }

  if (fx_param_bool(NULL, "save", 0)) {
    fx_default_param(NULL, "kfold/save", "1");
    dataset->WriteCsv("m_normalized.csv");
  }
}
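/* For reference, the transform above in matrix notation: with X the
 * (features x points) matrix and mu the per-feature mean vector,
 *
 *   X' = C^(-1/2) (X - mu 1^T),  C = (X - mu 1^T)(X - mu 1^T)^T / (n - 1),
 *
 * i.e. classical whitening: the output has identity sample covariance,
 * provided C has no (near-)zero eigenvalues.
 */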

/**
 * Generate a small artificial three-class dataset (used when no data
 * file is supplied) and save it to artificialdata.csv.
 */
void GenerateArtificialDataset(Dataset* dataset) {
  Matrix m;
  index_t n = fx_param_int(NULL, "n", 30);
  double offset = fx_param_double(NULL, "offset", 0.0);
  double range = fx_param_double(NULL, "range", 1.0);
  double slope = fx_param_double(NULL, "slope", 1.0);
  double margin = fx_param_double(NULL, "margin", 1.0);
  double var = fx_param_double(NULL, "var", 1.0);
  double intercept = fx_param_double(NULL, "intercept", 0.0);

  /* Points are generated in triples, one per class; round n down to a
   * multiple of 3 so no column is left uninitialized. */
  n = n - (n % 3);
  m.Init(3, n);
  for (index_t i = 0; i < n; i += 3) {
    double x;
    double y;

    /* Class 0. */
    x = (rand() * range / RAND_MAX) + offset;
    y = margin / 2 + (rand() * var / RAND_MAX);
    m.set(0, i, x);
    m.set(1, i, x * slope + y + intercept);
    m.set(2, i, 0);

    /* Class 1: first coordinate scaled by 10. */
    x = (rand() * range / RAND_MAX) + offset;
    y = margin / 2 + (rand() * var / RAND_MAX);
    m.set(0, i + 1, 10 * x);
    m.set(1, i + 1, x * slope + y + intercept);
    m.set(2, i + 1, 1);

    /* Class 2: first coordinate scaled by 20. */
    x = (rand() * range / RAND_MAX) + offset;
    y = margin / 2 + (rand() * var / RAND_MAX);
    m.set(0, i + 2, 20 * x);
    m.set(1, i + 2, x * slope + y + intercept);
    m.set(2, i + 2, 2);
  }

  data::Save("artificialdata.csv", m);
  dataset->OwnMatrix(&m);
}

/**
 * Load a dataset from the file named by the given fx parameter; if the
 * parameter is absent, fall back to an artificial dataset.  Returns 1
 * on success and 0 on failure.
 */
int LoadData(Dataset* dataset, String datafilename) {
  if (fx_param_exists(NULL, datafilename)) {
    /* A data file was specified; use it. */
    if (!PASSED(dataset->InitFromFile(fx_param_str_req(NULL, datafilename)))) {
      fprintf(stderr, "Couldn't open the data file.\n");
      return 0;
    }
  }
  else {
    fprintf(stderr, "No data file exists. Generating an artificial dataset.\n");
    GenerateArtificialDataset(dataset);
  }

  if (fx_param_bool(NULL, "normalize", 1)) {
    fprintf(stderr, "Normalizing...\n");
    DoSvmNormalize(dataset);
  } else {
    fprintf(stderr, "Skipping normalization...\n");
  }
  return 1;
}

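/*
 * Typical workflow (a sketch inferred from the calls below, not from
 * separate documentation): "train" fits a model via SVM::InitTrain,
 * presumably saving it under the name "svm_model" (the name the test
 * branch loads); "test" reloads that model with LoadModelBatchPredict
 * and writes predictions to "predicted_values"; "train_test" trains
 * and predicts in one run without reloading from disk.
 */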
/**
 * Entry point: parse parameters, then dispatch on the requested mode
 * (cross validation, training, training + testing, or testing only).
 */
int main(int argc, char *argv[]) {
  fx_module *root = fx_init(argc, argv, &svm_main_doc);
  srand(time(NULL));

  String mode = fx_param_str_req(root, "mode");
  String kernel = fx_param_str_req(root, "kernel");
  String learner_name = fx_param_str_req(root, "learner_name");
  int learner_typeid;

  if (learner_name == "svm_c") { /* classification */
    learner_typeid = 0;
  }
  else if (learner_name == "svm_r") { /* regression */
    learner_typeid = 1;
  }
  else if (learner_name == "svm_de") { /* one-class SVM */
    learner_typeid = 2;
  }
  else {
    fprintf(stderr, "Unknown support vector learner name! Program stops!\n");
    return 1;
  }

  if (kernel != "linear" && kernel != "gaussian") {
    fprintf(stderr, "Unknown kernel name! Program stops!\n");
    return 1;
  }

  /* Cross validation mode. */
  if (mode == "cv") {
    fprintf(stderr, "SVM Cross Validation... \n");

    Dataset cvset;
    if (LoadData(&cvset, "cv_data") == 0)
      return 1;

    if (kernel == "linear") {
      GeneralCrossValidator< SVM<SVMLinearKernel> > cross_validator;
      cross_validator.Init(learner_typeid, fx_param_int_req(root, "k_cv"), &cvset, fx_root, "svm");
      cross_validator.Run(true);
    }
    else if (kernel == "gaussian") {
      GeneralCrossValidator< SVM<SVMRBFKernel> > cross_validator;
      cross_validator.Init(learner_typeid, fx_param_int_req(root, "k_cv"), &cvset, fx_root, "svm");
      cross_validator.Run(true);
    }
  }
  /* Training (and optional testing) mode. */
  else if (mode == "train" || mode == "train_test") {
    fprintf(stderr, "SVM Training... \n");

    Dataset trainset;
    if (LoadData(&trainset, "train_data") == 0)
      return 1;

    datanode *svm_module = fx_submodule(fx_root, "svm");

    if (kernel == "linear") {
      SVM<SVMLinearKernel> svm;
      svm.InitTrain(learner_typeid, trainset, svm_module);

      if (mode == "train_test") {
        fprintf(stderr, "SVM Predicting... \n");

        Dataset testset;
        if (LoadData(&testset, "test_data") == 0)
          return 1;
        svm.BatchPredict(learner_typeid, testset, "predicted_values");
      }
    }
    else if (kernel == "gaussian") {
      SVM<SVMRBFKernel> svm;
      svm.InitTrain(learner_typeid, trainset, svm_module);

      if (mode == "train_test") {
        fprintf(stderr, "SVM Predicting... \n");

        Dataset testset;
        if (LoadData(&testset, "test_data") == 0)
          return 1;
        svm.BatchPredict(learner_typeid, testset, "predicted_values");
      }
    }
  }
  /* Testing mode: load a previously saved model, then predict. */
  else if (mode == "test") {
    fprintf(stderr, "SVM Predicting... \n");

    Dataset testset;
    if (LoadData(&testset, "test_data") == 0)
      return 1;

    datanode *svm_module = fx_submodule(fx_root, "svm");

    if (kernel == "linear") {
      SVM<SVMLinearKernel> svm;
      svm.Init(learner_typeid, testset, svm_module);
      svm.LoadModelBatchPredict(learner_typeid, testset, "svm_model", "predicted_values");
    }
    else if (kernel == "gaussian") {
      SVM<SVMRBFKernel> svm;
      svm.Init(learner_typeid, testset, svm_module);
      svm.LoadModelBatchPredict(learner_typeid, testset, "svm_model", "predicted_values");
    }
  }
  else {
    fprintf(stderr, "Unknown mode! Program stops!\n");
    return 1;
  }

  fx_done(root);
  return 0;
}