00001 /* MLPACK 0.2
00002  *
00003  * Copyright (c) 2008, 2009 Alexander Gray,
00004  *                          Garry Boyer,
00005  *                          Ryan Riegel,
00006  *                          Nikolaos Vasiloglou,
00007  *                          Dongryeol Lee,
00008  *                          Chip Mappus, 
00009  *                          Nishant Mehta,
00010  *                          Hua Ouyang,
00011  *                          Parikshit Ram,
00012  *                          Long Tran,
00013  *                          Wee Chin Wong
00014  *
00015  * Copyright (c) 2008, 2009 Georgia Institute of Technology
00016  *
00017  * This program is free software; you can redistribute it and/or
00018  * modify it under the terms of the GNU General Public License as
00019  * published by the Free Software Foundation; either version 2 of the
00020  * License, or (at your option) any later version.
00021  *
00022  * This program is distributed in the hope that it will be useful, but
00023  * WITHOUT ANY WARRANTY; without even the implied warranty of
00024  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00025  * General Public License for more details.
00026  *
00027  * You should have received a copy of the GNU General Public License
00028  * along with this program; if not, write to the Free Software
00029  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
00030  * 02110-1301, USA.
00031  */
00054 #include "svm.h"
00055 #include "fastlib/math/statistics.h"
00057 const fx_entry_doc svm_main_entries_doc[] = {
00058   {"learner_name", FX_REQUIRED, FX_STR, NULL,
00059    "  The name of the support vecotr learner, values: \"svm_c\" for classification, \"svm_r\" for regression, \"svm_de\" for one class SVM\n"},
00060   {"mode", FX_REQUIRED, FX_STR, NULL,
00061    "  The mode of svm_main, values: \"cv\", \"train\", \"train_test\", \"test\".\n"},
00062   {"k_cv", FX_PARAM, FX_INT, NULL,
00063    "  The number of folds for cross validation, only required under \"cv\" mode.\n"},
00064   {"cv_data", FX_PARAM, FX_STR, NULL,
00065    "  The file name for cross validation data, only required under \"cv\" mode.\n"},
00066   {"train_data", FX_PARAM, FX_STR, NULL,
00067    "  The file name for training data, only required under \"train\" or \"train_test\" mode.\n"},
00068   {"test_data", FX_PARAM, FX_STR, NULL,
00069    "  The file name for testing data, only required under \"test\" or \"train_test\" mode.\n"},
00070   {"kernel", FX_REQUIRED, FX_STR, NULL,
00071    "  Kernel name, values:\"linear\", \"gaussian\".\n"},
00072   {"sigma", FX_PARAM, FX_DOUBLE, NULL,
00073    "  (for Gaussian kernel) sigma in the gaussian kernel k(x1,x2)=exp(-(x1-x2)^2/(2sigma^2)), only required when using \"guassian\" kernel\n"},
00074   {"c", FX_PARAM, FX_DOUBLE, NULL,
00075    "  (for SVM_C) the weight (0~1) that controls compromise between large margins and small margin violations. Default value: 10.0.\n"},
00076   {"c_p", FX_PARAM, FX_DOUBLE, NULL,
00077    "  (for SVM_C) the weight (0~1) for the positive class (y==1). Default value: c.\n"},
00078   {"c_n", FX_PARAM, FX_DOUBLE, NULL,
00079    "  (for SVM_C) the weight (0~1) for the negative class (y==-1). Default value: c.\n"},
00080   {"epsilon", FX_PARAM, FX_DOUBLE, NULL,
00081    "  (for SVM_R) the epsilon in SVM regression of epsilon-insensitive loss. Default value: 0.1.\n"},
00082   {"wss", FX_PARAM, FX_INT, NULL,
00083    "  Working set selection scheme. 1 for 1st order expansion; 2 for 2nd order expansion. Default value: 1.\n"},
00084   {"normalize", FX_PARAM, FX_BOOL, NULL,
00085    "  Whether need to do data normalization before training/testing, values: \"0\" for no normalize, \"1\" for normalize.\n"},
00087 };
00089 const fx_module_doc svm_main_doc = {
00090   svm_main_entries_doc, NULL,
00091   "These are the implementations for Support Vector Machines (SVM), including Multiclass classification, Regression, and One Class SVM)\n"
00092 };
00099 void DoSvmNormalize(Dataset* dataset) {
00100   Matrix m;
00101   Vector sums;
00103   m.Init(dataset->n_features()-1, dataset->n_points());
00104   sums.Init(dataset->n_features() - 1);
00105   sums.SetZero();
00107   for (index_t i = 0; i < dataset->n_points(); i++) {
00108     Vector s;
00109     Vector d;
00110     dataset->matrix().MakeColumnSubvector(i, 0, dataset->n_features()-1, &s);
00111     m.MakeColumnVector(i, &d);
00112     d.CopyValues(s);
00113     la::AddTo(s, &sums);
00114   }
00116   la::Scale(-1.0 / dataset->n_points(), &sums);
00117   for (index_t i = 0; i < dataset->n_points(); i++) {
00118     Vector d;
00119     m.MakeColumnVector(i, &d);
00120     la::AddTo(sums, &d);
00121   }
00123   Matrix cov;
00125   la::MulTransBInit(m, m, &cov);
00127   Vector d;
00128   Matrix u; // eigenvectors
00129   Matrix ui; // the inverse of eigenvectors
00131   PASSED(la::EigenvectorsInit(cov, &d, &u));
00132   la::TransposeInit(u, &ui);
00134   for (index_t i = 0; i < d.length(); i++) {
00135     d[i] = 1.0 / sqrt(d[i] / (dataset->n_points() - 1));
00136   }
00138   la::ScaleRows(d, &ui);
00140   Matrix cov_inv_half;
00141   la::MulInit(u, ui, &cov_inv_half);
00143   Matrix final;
00144   la::MulInit(cov_inv_half, m, &final);
00146   for (index_t i = 0; i < dataset->n_points(); i++) {
00147     Vector s;
00148     Vector d;
00149     dataset->matrix().MakeColumnSubvector(i, 0, dataset->n_features()-1, &d);
00150     final.MakeColumnVector(i, &s);
00151     d.CopyValues(s);
00152   }
00154   if (fx_param_bool(NULL, "save", 0)) {
00155     fx_default_param(NULL, "kfold/save", "1");
00156     dataset->WriteCsv("m_normalized.csv");
00157   }
00158 }
00165 void GenerateArtificialDataset(Dataset* dataset){
00166   Matrix m;
00167   index_t n = fx_param_int(NULL, "n", 30);
00168   double offset = fx_param_double(NULL, "offset", 0.0);
00169   double range = fx_param_double(NULL, "range", 1.0);
00170   double slope = fx_param_double(NULL, "slope", 1.0);
00171   double margin = fx_param_double(NULL, "margin", 1.0);
00172   double var = fx_param_double(NULL, "var", 1.0);
00173   double intercept = fx_param_double(NULL, "intercept", 0.0);
00175   // 2 dimensional dataset, size n, 3 classes
00176   m.Init(3, n);
00177   for (index_t i = 0; i < n; i += 3) {
00178     double x;
00179     double y;
00181     x = (rand() * range / RAND_MAX) + offset;
00182     y = margin / 2 + (rand() * var / RAND_MAX);
00183     m.set(0, i, x);
00184     m.set(1, i, x*slope + y + intercept);
00185     m.set(2, i, 0); // labels
00187     x = (rand() * range / RAND_MAX) + offset;
00188     y = margin / 2 + (rand() * var / RAND_MAX);
00189     m.set(0, i+1, 10*x);
00190     m.set(1, i+1, x*slope + y + intercept);
00191     m.set(2, i+1, 1); // labels
00193     x = (rand() * range / RAND_MAX) + offset;
00194     y = margin / 2 + (rand() * var / RAND_MAX);
00195     m.set(0, i+2, 20*x);
00196     m.set(1, i+2, x*slope + y + intercept);
00197     m.set(2, i+2, 2); // labels
00198   }
00200   data::Save("artificialdata.csv", m); // TODO, for training, for testing
00201   dataset->OwnMatrix(&m);
00202 }
00211 int LoadData(Dataset* dataset, String datafilename){
00212   if (fx_param_exists(NULL, datafilename)) {
00213     // when a data file is specified, use it.
00214     if ( !PASSED(dataset->InitFromFile( fx_param_str_req(NULL, datafilename) )) ) {
00215     fprintf(stderr, "Couldn't open the data file.\n");
00216     return 0;
00217     }
00218   } 
00219   else {
00220     fprintf(stderr, "No data file exist. Generating artificial dataset.\n");
00221     // otherwise, generate an artificial dataset and save it to "m.csv"
00222     GenerateArtificialDataset(dataset);
00223   }
00225   if (fx_param_bool(NULL, "normalize", 1)) {
00226     fprintf(stderr, "Normalizing...\n");
00227     DoSvmNormalize(dataset);
00228   } else {
00229     fprintf(stderr, "Skipping normalization...\n");
00230   }
00231   return 1;
00232 }
00240 int main(int argc, char *argv[]) {
00241   //fx_init(argc, argv, NULL);
00242   fx_module *root = fx_init(argc, argv, &svm_main_doc);
00243   srand(time(NULL));
00245   String mode = fx_param_str_req(NULL, "mode");
00246   String kernel = fx_param_str_req(NULL, "kernel");
00247   String learner_name = fx_param_str_req(root,"learner_name");
00248   int learner_typeid;
00250   if (learner_name == "svm_c") { // Support Vector Classfication
00251     learner_typeid = 0;
00252   }
00253   else if (learner_name == "svm_r") { // Support Vector Regression
00254     learner_typeid = 1;
00255   }
00256   else if (learner_name == "svm_de") { // One Class Support Vector Machine
00257     learner_typeid = 2;
00258   }
00259   else {
00260     fprintf(stderr, "Unknown support vector learner name! Program stops!\n");
00261     return 0;
00262   }
00264   // TODO: more kernels to be supported
00266   /* Cross Validation Mode, need cross validation data */
00267   if(mode == "cv") { 
00268     fprintf(stderr, "SVM Cross Validation... \n");
00270     /* Load cross validation data */
00271     Dataset cvset;
00272     if (LoadData(&cvset, "cv_data") == 0)
00273     return 1;
00275     if (kernel == "linear") {
00276       GeneralCrossValidator< SVM<SVMLinearKernel> > cross_validator; 
00277       /* Initialize n_folds_, confusion_matrix_; k_cv: number of cross-validation folds, need k_cv>1 */
00278       cross_validator.Init(learner_typeid, fx_param_int_req(NULL,"k_cv"), &cvset, fx_root, "svm");
00279       /* k_cv folds cross validation; (true): do training set permutation */
00280       cross_validator.Run(true);
00281       //cross_validator.confusion_matrix().PrintDebug("confusion matrix");
00282     }
00283     else if (kernel == "gaussian") {
00284       GeneralCrossValidator< SVM<SVMRBFKernel> > cross_validator; 
00285       /* Initialize n_folds_, confusion_matrix_; k_cv: number of cross-validation folds */
00286       cross_validator.Init(learner_typeid, fx_param_int_req(NULL,"k_cv"), &cvset, fx_root, "svm");
00287       /* k_cv folds cross validation; (true): do training set permutation */
00288       cross_validator.Run(true);
00289       //cross_validator.confusion_matrix().PrintDebug("confusion matrix");
00290     }
00291   }
00292   /* Training Mode, need training data | Training + Testing(online) Mode, need training data + testing data */
00293   else if (mode=="train" || mode=="train_test"){
00294     fprintf(stderr, "SVM Training... \n");
00296     /* Load training data */
00297     Dataset trainset;
00298     if (LoadData(&trainset, "train_data") == 0) // TODO:param_req
00299       return 1;
00301     /* Begin SVM Training | Training and Testing */
00302     datanode *svm_module = fx_submodule(fx_root, "svm");
00304     if (kernel == "linear") {
00305       SVM<SVMLinearKernel> svm;
00306       svm.InitTrain(learner_typeid, trainset, svm_module);
00307       /* training and testing, thus no need to load model from file */
00308       if (mode=="train_test"){
00309         fprintf(stderr, "SVM Predicting... \n");
00310         /* Load testing data */
00311         Dataset testset;
00312         if (LoadData(&testset, "test_data") == 0) // TODO:param_req
00313           return 1;
00314         svm.BatchPredict(learner_typeid, testset, "predicted_values");
00315       }
00316     }
00317     else if (kernel == "gaussian") {
00318       SVM<SVMRBFKernel> svm;
00319       svm.InitTrain(learner_typeid, trainset, svm_module);
00320       /* training and testing, thus no need to load model from file */
00321       if (mode=="train_test"){
00322         fprintf(stderr, "SVM Predicting... \n");
00323         /* Load testing data */
00324         Dataset testset;
00325         if (LoadData(&testset, "test_data") == 0) // TODO:param_req
00326           return 1;
00327         svm.BatchPredict(learner_typeid, testset, "predicted_values"); // TODO:param_req
00328       }
00329     }
00330   }
00331   /* Testing(offline) Mode, need loading model file and testing data */
00332   else if (mode=="test") {
00333     fprintf(stderr, "SVM Predicting... \n");
00335     /* Load testing data */
00336     Dataset testset;
00337     if (LoadData(&testset, "test_data") == 0) // TODO:param_req
00338       return 1;
00340     /* Begin Prediction */
00341     datanode *svm_module = fx_submodule(fx_root, "svm");
00343     if (kernel == "linear") {
00344       SVM<SVMLinearKernel> svm;
00345       svm.Init(learner_typeid, testset, svm_module); 
00346       svm.LoadModelBatchPredict(learner_typeid, testset, "svm_model", "predicted_values"); // TODO:param_req
00347     }
00348     else if (kernel == "gaussian") {
00349       SVM<SVMRBFKernel> svm;
00350       svm.Init(learner_typeid, testset, svm_module); 
00351       svm.LoadModelBatchPredict(learner_typeid, testset, "svm_model", "predicted_values"); // TODO:param_req
00352     }
00353   }
00354   fx_done(NULL);
00355 }
