support.cc

00001 /* MLPACK 0.2
00002  *
00003  * Copyright (c) 2008, 2009 Alexander Gray,
00004  *                          Garry Boyer,
00005  *                          Ryan Riegel,
00006  *                          Nikolaos Vasiloglou,
00007  *                          Dongryeol Lee,
00008  *                          Chip Mappus, 
00009  *                          Nishant Mehta,
00010  *                          Hua Ouyang,
00011  *                          Parikshit Ram,
00012  *                          Long Tran,
00013  *                          Wee Chin Wong
00014  *
00015  * Copyright (c) 2008, 2009 Georgia Institute of Technology
00016  *
00017  * This program is free software; you can redistribute it and/or
00018  * modify it under the terms of the GNU General Public License as
00019  * published by the Free Software Foundation; either version 2 of the
00020  * License, or (at your option) any later version.
00021  *
00022  * This program is distributed in the hope that it will be useful, but
00023  * WITHOUT ANY WARRANTY; without even the implied warranty of
00024  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00025  * General Public License for more details.
00026  *
00027  * You should have received a copy of the GNU General Public License
00028  * along with this program; if not, write to the Free Software
00029  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
00030  * 02110-1301, USA.
00031  */
00032 #include "fastlib/fastlib.h"
00033 #include "support.h"
00034 
00035 namespace hmm_support {
00036 
00037   double RAND_UNIFORM_01() {
00038     return (double) rand() / (double)RAND_MAX;
00039   }
00040 
00041   double RAND_UNIFORM(double a, double b) {
00042     return RAND_UNIFORM_01() * (b-a) + a;
00043   }
00044 
00045   void print_matrix(const Matrix& a, const char* msg) {
00046     printf("%s - Matrix (%d x %d) = \n", msg, a.n_rows(), a.n_cols());
00047     for (int i = 0; i < a.n_rows(); i++) {
00048       for (int j = 0; j < a.n_cols(); j++)
00049         printf("%8.4f", a.get(i, j));
00050       printf("\n");
00051     }
00052   }
00053 
00054   void print_matrix(TextWriter& writer, const Matrix& a, const char* msg, const char* format) {
00055     writer.Printf("%s - Matrix (%d x %d) = \n", msg, a.n_rows(), a.n_cols());
00056     for (int j = 0; j < a.n_cols(); j++) {
00057       for (int i = 0; i < a.n_rows(); i++)
00058         writer.Printf(format, a.get(i, j));
00059       writer.Printf("\n");
00060     }
00061   }
00062 
00063   void print_vector(const Vector& a, const char* msg) {
00064     printf("%s - Vector (%d) = \n", msg, a.length());
00065     for (int i = 0; i < a.length(); i++)
00066       printf("%8.4f", a[i]);
00067     printf("\n");
00068   }
00069 
00070   void print_vector(TextWriter& writer, const Vector& a, const char* msg, const char* format) {
00071     writer.Printf("%s - Vector (%d) = \n", msg, a.length());
00072     for (int i = 0; i < a.length(); i++)
00073       writer.Printf(format, a[i]);
00074     writer.Printf("\n");
00075   }
00076 
00077   double RAND_NORMAL_01() {
00078     double r = 2, u, v;
00079     while (r > 1) {
00080       u = RAND_UNIFORM(-1, 1);
00081       v = RAND_UNIFORM(-1, 1);
00082       r = u*u+v*v;
00083     }
00084     return sqrt(-2*log(r)/r)*u;
00085   }
00086 
00087   void RAND_NORMAL_01_INIT(int N, Vector* v) {
00088     double r, u, t;
00089     Vector& v_ = *v;
00090     v_.Init(N);
00091     for (int i = 0; i < N; i+=2) {
00092       r = 2;
00093       while (r > 1) {
00094         u = RAND_UNIFORM(-1, 1);
00095         t = RAND_UNIFORM(-1, 1);
00096         r = u*u+t*t;
00097       }
00098       v_[i] = sqrt(-2*log(r)/r)*u;
00099       if (i+1 < N) v_[i+1] = sqrt(-2*log(r)/r)*t;
00100     }
00101   }
00102 
00103   void RAND_NORMAL_INIT(const Vector& mean, const Matrix& cov, Vector* v) {
00104     int N = mean.length();
00105     Vector v01;
00106     RAND_NORMAL_01_INIT(N, &v01);
00107     la::MulInit(cov, v01, v);
00108     la::AddTo(mean, v);
00109   }
00110 
00111   // return x'Ay
00112   double MyMulExpert(const Vector& x, const Matrix& A, const Vector& y) {
00113     int M = A.n_rows();
00114     int N = A.n_cols();
00115     DEBUG_ASSERT_MSG((M==x.length() && N==y.length()), "MyMulExpert: sizes do not match");
00116 
00117     double s = 0;
00118     for (int i = 0; i < M; i++)
00119       for (int j = 0; j < N; j++)
00120         s += x[i] * A.get(i, j) * y[j];
00121     return s;
00122   }
00123 
00124   double NORMAL_DENSITY(const Vector& x, const Vector& mean, const Matrix& inv_cov, double det_cov) {
00125     Vector d;
00126     la::SubInit(x, mean, &d);
00127     return det_cov * exp(-0.5*MyMulExpert(d, inv_cov, d));
00128   }
00129 
00130   bool kmeans(const ArrayList<Matrix>& data, int num_clusters, 
00131               ArrayList<int> *labels_, ArrayList<Vector> *centroids_, 
00132               int max_iter, double error_thresh)
00133   {
00134     ArrayList<int> counts; //number of points in each cluster
00135     ArrayList<Vector> tmp_centroids;
00136     int num_points, num_dims;
00137     int i, j, num_iter=0;
00138     double error, old_error;
00139 
00140     //Assign pointers to references to avoid repeated dereferencing.
00141     ArrayList<int> &labels = *labels_;
00142     ArrayList<Vector> &centroids = *centroids_;
00143 
00144     num_points = 0;
00145     for (int i = 0; i < data.size(); i++)
00146       num_points+=data[i].n_cols();
00147     if (num_points < num_clusters) 
00148       return false;
00149 
00150     num_dims = data[0].n_rows();
00151 
00152     centroids.Init(num_clusters);
00153     tmp_centroids.Init(num_clusters);
00154 
00155     counts.Init(num_clusters);
00156     labels.Init(num_points);
00157 
00158     //Initialize the clusters to k points
00159     for (j=0; j < num_clusters; j++) {
00160       Vector temp_vector;
00161       int i = (int) math::Random(0, data.size() - 0.5);
00162       int k = (int) math::Random(0, data[i].n_cols()-0.5);
00163       data[i].MakeColumnVector(k, &temp_vector);
00164       centroids[j].Copy(temp_vector);
00165       tmp_centroids[j].Init(num_dims);
00166     }
00167 
00168     error = DBL_MAX;
00169 
00170     do {
00171 
00172       old_error = error; error = 0;
00173 
00174       for (i=0; i < num_clusters; i++) {
00175         tmp_centroids[i].SetZero();
00176             counts[i] = 0;
00177       }
00178       i = 0;
00179       for (int t=0, i = 0; t<data.size(); t++) 
00180       for (int k = 0; k < data[t].n_cols(); k++, i++) {
00181 
00182         // Find the cluster closest to this point and update its label
00183         double min_distance = DBL_MAX;
00184         Vector data_i_Vec;
00185         data[t].MakeColumnVector(k, &data_i_Vec);
00186 
00187         for (j=0; j<num_clusters; j++ ) {
00188           double distance = la::DistanceSqEuclidean(data_i_Vec, centroids[j]);
00189           if (distance < min_distance) {
00190             labels[i] = j;
00191             min_distance = distance;
00192           }
00193         }
00194 
00195         // Accumulate the stats for the new centroid of the target cluster
00196         la::AddTo(data_i_Vec, &(tmp_centroids[labels[i]]));
00197         counts[labels[i]]++;
00198         error += min_distance;
00199       }
00200 
00201       // Now update all the centroids
00202       for (int j=0; j < num_clusters; j++) {
00203         if (counts[j] > 0)
00204           la::ScaleOverwrite((1/(double)counts[j]), 
00205                              tmp_centroids[j], &(centroids[j]));
00206       }
00207       num_iter++;
00208 
00209     } while ((fabs(error - old_error) > error_thresh)
00210              && (num_iter < max_iter));
00211 
00212     return true;
00213 
00214   }
00215 
00216   bool kmeans(Matrix const &data, int num_clusters, 
00217               ArrayList<int> *labels_, ArrayList<Vector> *centroids_, 
00218               int max_iter, double error_thresh)
00219   {
00220     ArrayList<int> counts; //number of points in each cluster
00221     ArrayList<Vector> tmp_centroids;
00222     int num_points, num_dims;
00223     int i, j, num_iter=0;
00224     double error, old_error;
00225     
00226     //Assign pointers to references to avoid repeated dereferencing.
00227     ArrayList<int> &labels = *labels_;
00228     ArrayList<Vector> &centroids = *centroids_;
00229     
00230     if (data.n_cols() < num_clusters) 
00231       return false;
00232     
00233     num_points = data.n_cols();
00234     num_dims = data.n_rows();
00235     
00236     centroids.Init(num_clusters);
00237     tmp_centroids.Init(num_clusters);
00238     
00239     counts.Init(num_clusters);
00240     labels.Init(num_points);
00241     
00242     //Initialize the clusters to k points
00243     for (i=0, j=0; j < num_clusters; i+=num_points/num_clusters, j++) {
00244       Vector temp_vector;
00245       data.MakeColumnVector(i, &temp_vector);
00246       centroids[j].Copy(temp_vector);
00247       tmp_centroids[j].Init(num_dims);
00248     }
00249     
00250     error = DBL_MAX;
00251     
00252     do {
00253       
00254       old_error = error; error = 0;
00255 
00256       for (i=0; i < num_clusters; i++) {
00257         tmp_centroids[i].SetZero();
00258         counts[i] = 0;
00259       }
00260       
00261      for (i=0; i<num_points; i++) {
00262        
00263        // Find the cluster closest to this point and update its label
00264        double min_distance = DBL_MAX;
00265        Vector data_i_Vec;
00266        data.MakeColumnVector(i, &data_i_Vec);
00267        
00268        for (j=0; j<num_clusters; j++ ) {
00269          double distance = la::DistanceSqEuclidean(data_i_Vec, centroids[j]);
00270          if (distance < min_distance) {
00271            labels[i] = j;
00272            min_distance = distance;
00273          }
00274        }
00275        
00276        // Accumulate the stats for the new centroid of the target cluster
00277        la::AddTo(data_i_Vec, &(tmp_centroids[labels[i]]));
00278        counts[labels[i]]++;
00279        error += min_distance;
00280      }
00281      
00282      // Now update all the centroids
00283      for (int j=0; j < num_clusters; j++) {
00284        if (counts[j] > 0)
00285          la::ScaleOverwrite((1/(double)counts[j]), 
00286                             tmp_centroids[j], &(centroids[j]));
00287      }
00288      num_iter++;
00289      
00290     } while ((fabs(error - old_error) > error_thresh)
00291              && (num_iter < max_iter));
00292     
00293     return true;
00294 
00295   }
00296   
00297   void mat2arrlst(Matrix& a, ArrayList<Vector> * seqs) {
00298     int n = a.n_cols();
00299     ArrayList<Vector> & s_ = *seqs;
00300     s_.Init();
00301     for (int i = 0; i < n; i++) {
00302       Vector seq;
00303       a.MakeColumnVector(i, &seq);
00304       s_.PushBackCopy(seq);
00305     }
00306   }
00307 
00308   void mat2arrlstmat(int N, Matrix& a, ArrayList<Matrix> * seqs) {
00309     int n = a.n_cols();
00310     ArrayList<Matrix>& s_ = *seqs;
00311     s_.Init();
00312     for (int i = 0; i < n; i+=N) {
00313       Matrix b;
00314       a.MakeColumnSlice(i, N, &b);
00315       s_.PushBackCopy(b);
00316     }
00317   }
00318 
00319   bool skip_blank(TextLineReader& reader) {
00320     for (;;){
00321       if (!reader.MoreLines()) return false;
00322       char* pos = reader.Peek().begin();
00323       while (*pos == ' ' || *pos == ',' || *pos == '\t')
00324         pos++;
00325       if (*pos == '\0' || *pos == '%') reader.Gobble();
00326       else break;
00327     }
00328     return true;
00329   }
00330 
00331  success_t read_matrix(TextLineReader& reader, Matrix* matrix) {
00332    if (!skip_blank(reader)) { // EOF ?
00333      matrix->Init(0,0);
00334      return SUCCESS_FAIL;
00335    }
00336    else {
00337      int n_rows = 0;
00338      int n_cols = 0;
00339      bool is_done;
00340      {// How many columns ?
00341        ArrayList<String> num_str;
00342        num_str.Init();
00343        reader.Peek().Split(", \t", &num_str);
00344        n_cols = num_str.size();
00345      }
00346      ArrayList<double> num_double;
00347      num_double.Init();
00348 
00349      for(;;) { // read each rows
00350        n_rows++;
00351        // deprecated: double* point = num_double.AddBack(n_cols);
00352        ArrayList<String> num_str;
00353        num_str.Init();
00354        reader.Peek().Split(", \t", &num_str);
00355 
00356        DEBUG_ASSERT(num_str.size() == n_cols);
00357 
00358        for (int i = 0; i < n_cols; i++)
00359          num_double.PushBackCopy(strtod(num_str[i], NULL));
00360 
00361        is_done = false;
00362 
00363        reader.Gobble();
00364 
00365        for (;;){
00366          if (!reader.MoreLines()) {
00367            is_done = true;
00368            break;
00369          }
00370          char* pos = reader.Peek().begin();
00371          while (*pos == ' ' || *pos == '\t')
00372            pos++;
00373          if (*pos == '\0') reader.Gobble();
00374          else if (*pos == '%') {
00375            is_done = true;
00376            break;
00377          }
00378          else break;
00379        }
00380 
00381        if (is_done) {
00382          num_double.Trim();
00383          matrix->Own(num_double.ReleasePtr(), n_cols, n_rows);
00384          return SUCCESS_PASS;
00385        }
00386      }
00387    }
00388  }
00389 
00390   success_t read_vector(TextLineReader& reader, Vector* vec) {
00391     if (!skip_blank(reader)) { // EOF ?
00392       vec->Init(0);
00393       return SUCCESS_FAIL;
00394     }
00395     else {
00396       ArrayList<double> num_double;
00397       num_double.Init();
00398 
00399       for(;;) { // read each rows
00400         bool is_done = false;
00401 
00402         ArrayList<String> num_str;
00403         num_str.Init();
00404         reader.Peek().Split(", \t", &num_str);
00405 
00406         // deprecated: double* point = num_double.AddBack(num_str.size());
00407 
00408         for (int i = 0; i < num_str.size(); i++)
00409           num_double.PushBackCopy(strtod(num_str[i], NULL));
00410 
00411         reader.Gobble();
00412 
00413         for (;;){
00414           if (!reader.MoreLines()) {
00415             is_done = true;
00416             break;
00417           }
00418           char* pos = reader.Peek().begin();
00419           while (*pos == ' ' || *pos == '\t')
00420             pos++;
00421           if (*pos == '\0') reader.Gobble();
00422           else if (*pos == '%') {
00423             is_done = true;
00424             break;
00425           }
00426           else break;
00427         }
00428 
00429         if (is_done) {
00430           num_double.Trim();
00431           int length = num_double.size();
00432           vec->Own(num_double.ReleasePtr(), length);
00433           return SUCCESS_PASS;
00434         }
00435       }
00436     }
00437   }
00438 
00439   success_t load_matrix_list(const char* filename, ArrayList<Matrix> *matlst) {
00440     TextLineReader reader;
00441     matlst->Init();
00442     if (!PASSED(reader.Open(filename))) return SUCCESS_FAIL;
00443     do {
00444       Matrix tmp;
00445       if (read_matrix(reader, &tmp) == SUCCESS_PASS) {
00446         matlst->PushBackCopy(tmp);
00447       }
00448       else break;
00449     } while (1);
00450     return SUCCESS_PASS;
00451   }
00452 
00453   success_t load_vector_list(const char* filename, ArrayList<Vector> *veclst) {
00454     TextLineReader reader;
00455     veclst->Init();
00456     if (!PASSED(reader.Open(filename))) return SUCCESS_FAIL;
00457     do {
00458       Vector vec;
00459       if (read_vector(reader, &vec) == SUCCESS_PASS) {
00460         veclst->PushBackCopy(vec);
00461       }
00462       else break;
00463     } while (1);
00464     return SUCCESS_PASS;
00465   }
00466 } // end namespace
Generated on Mon Jan 24 12:04:38 2011 for FASTlib by  doxygen 1.6.3