00001 
00002 
00003 
00004 
00005 
00006 
00007 
00008 
00009 
00010 
00011 
00012 
00013 
00014 
00015 
00016 
00017 
00018 
00019 
00020 
00021 
00022 
00023 
00024 
00025 
00026 
00027 
00028 
00029 
00030 
00031 
00032 #include "fastlib/fastlib.h"
00033 #include "support.h"
00034 
00035 namespace hmm_support {
00036 
00037   double RAND_UNIFORM_01() {
00038     return (double) rand() / (double)RAND_MAX;
00039   }
00040 
00041   double RAND_UNIFORM(double a, double b) {
00042     return RAND_UNIFORM_01() * (b-a) + a;
00043   }
00044 
00045   void print_matrix(const Matrix& a, const char* msg) {
00046     printf("%s - Matrix (%d x %d) = \n", msg, a.n_rows(), a.n_cols());
00047     for (int i = 0; i < a.n_rows(); i++) {
00048       for (int j = 0; j < a.n_cols(); j++)
00049         printf("%8.4f", a.get(i, j));
00050       printf("\n");
00051     }
00052   }
00053 
00054   void print_matrix(TextWriter& writer, const Matrix& a, const char* msg, const char* format) {
00055     writer.Printf("%s - Matrix (%d x %d) = \n", msg, a.n_rows(), a.n_cols());
00056     for (int j = 0; j < a.n_cols(); j++) {
00057       for (int i = 0; i < a.n_rows(); i++)
00058         writer.Printf(format, a.get(i, j));
00059       writer.Printf("\n");
00060     }
00061   }
00062 
00063   void print_vector(const Vector& a, const char* msg) {
00064     printf("%s - Vector (%d) = \n", msg, a.length());
00065     for (int i = 0; i < a.length(); i++)
00066       printf("%8.4f", a[i]);
00067     printf("\n");
00068   }
00069 
00070   void print_vector(TextWriter& writer, const Vector& a, const char* msg, const char* format) {
00071     writer.Printf("%s - Vector (%d) = \n", msg, a.length());
00072     for (int i = 0; i < a.length(); i++)
00073       writer.Printf(format, a[i]);
00074     writer.Printf("\n");
00075   }
00076 
00077   double RAND_NORMAL_01() {
00078     double r = 2, u, v;
00079     while (r > 1) {
00080       u = RAND_UNIFORM(-1, 1);
00081       v = RAND_UNIFORM(-1, 1);
00082       r = u*u+v*v;
00083     }
00084     return sqrt(-2*log(r)/r)*u;
00085   }
00086 
00087   void RAND_NORMAL_01_INIT(int N, Vector* v) {
00088     double r, u, t;
00089     Vector& v_ = *v;
00090     v_.Init(N);
00091     for (int i = 0; i < N; i+=2) {
00092       r = 2;
00093       while (r > 1) {
00094         u = RAND_UNIFORM(-1, 1);
00095         t = RAND_UNIFORM(-1, 1);
00096         r = u*u+t*t;
00097       }
00098       v_[i] = sqrt(-2*log(r)/r)*u;
00099       if (i+1 < N) v_[i+1] = sqrt(-2*log(r)/r)*t;
00100     }
00101   }
00102 
00103   void RAND_NORMAL_INIT(const Vector& mean, const Matrix& cov, Vector* v) {
00104     int N = mean.length();
00105     Vector v01;
00106     RAND_NORMAL_01_INIT(N, &v01);
00107     la::MulInit(cov, v01, v);
00108     la::AddTo(mean, v);
00109   }
00110 
00111   
00112   double MyMulExpert(const Vector& x, const Matrix& A, const Vector& y) {
00113     int M = A.n_rows();
00114     int N = A.n_cols();
00115     DEBUG_ASSERT_MSG((M==x.length() && N==y.length()), "MyMulExpert: sizes do not match");
00116 
00117     double s = 0;
00118     for (int i = 0; i < M; i++)
00119       for (int j = 0; j < N; j++)
00120         s += x[i] * A.get(i, j) * y[j];
00121     return s;
00122   }
00123 
00124   double NORMAL_DENSITY(const Vector& x, const Vector& mean, const Matrix& inv_cov, double det_cov) {
00125     Vector d;
00126     la::SubInit(x, mean, &d);
00127     return det_cov * exp(-0.5*MyMulExpert(d, inv_cov, d));
00128   }
00129 
00130   bool kmeans(const ArrayList<Matrix>& data, int num_clusters, 
00131               ArrayList<int> *labels_, ArrayList<Vector> *centroids_, 
00132               int max_iter, double error_thresh)
00133   {
00134     ArrayList<int> counts; 
00135     ArrayList<Vector> tmp_centroids;
00136     int num_points, num_dims;
00137     int i, j, num_iter=0;
00138     double error, old_error;
00139 
00140     
00141     ArrayList<int> &labels = *labels_;
00142     ArrayList<Vector> ¢roids = *centroids_;
00143 
00144     num_points = 0;
00145     for (int i = 0; i < data.size(); i++)
00146       num_points+=data[i].n_cols();
00147     if (num_points < num_clusters) 
00148       return false;
00149 
00150     num_dims = data[0].n_rows();
00151 
00152     centroids.Init(num_clusters);
00153     tmp_centroids.Init(num_clusters);
00154 
00155     counts.Init(num_clusters);
00156     labels.Init(num_points);
00157 
00158     
00159     for (j=0; j < num_clusters; j++) {
00160       Vector temp_vector;
00161       int i = (int) math::Random(0, data.size() - 0.5);
00162       int k = (int) math::Random(0, data[i].n_cols()-0.5);
00163       data[i].MakeColumnVector(k, &temp_vector);
00164       centroids[j].Copy(temp_vector);
00165       tmp_centroids[j].Init(num_dims);
00166     }
00167 
00168     error = DBL_MAX;
00169 
00170     do {
00171 
00172       old_error = error; error = 0;
00173 
00174       for (i=0; i < num_clusters; i++) {
00175         tmp_centroids[i].SetZero();
00176             counts[i] = 0;
00177       }
00178       i = 0;
00179       for (int t=0, i = 0; t<data.size(); t++) 
00180       for (int k = 0; k < data[t].n_cols(); k++, i++) {
00181 
00182         
00183         double min_distance = DBL_MAX;
00184         Vector data_i_Vec;
00185         data[t].MakeColumnVector(k, &data_i_Vec);
00186 
00187         for (j=0; j<num_clusters; j++ ) {
00188           double distance = la::DistanceSqEuclidean(data_i_Vec, centroids[j]);
00189           if (distance < min_distance) {
00190             labels[i] = j;
00191             min_distance = distance;
00192           }
00193         }
00194 
00195         
00196         la::AddTo(data_i_Vec, &(tmp_centroids[labels[i]]));
00197         counts[labels[i]]++;
00198         error += min_distance;
00199       }
00200 
00201       
00202       for (int j=0; j < num_clusters; j++) {
00203         if (counts[j] > 0)
00204           la::ScaleOverwrite((1/(double)counts[j]), 
00205                              tmp_centroids[j], &(centroids[j]));
00206       }
00207       num_iter++;
00208 
00209     } while ((fabs(error - old_error) > error_thresh)
00210              && (num_iter < max_iter));
00211 
00212     return true;
00213 
00214   }
00215 
00216   bool kmeans(Matrix const &data, int num_clusters, 
00217               ArrayList<int> *labels_, ArrayList<Vector> *centroids_, 
00218               int max_iter, double error_thresh)
00219   {
00220     ArrayList<int> counts; 
00221     ArrayList<Vector> tmp_centroids;
00222     int num_points, num_dims;
00223     int i, j, num_iter=0;
00224     double error, old_error;
00225     
00226     
00227     ArrayList<int> &labels = *labels_;
00228     ArrayList<Vector> ¢roids = *centroids_;
00229     
00230     if (data.n_cols() < num_clusters) 
00231       return false;
00232     
00233     num_points = data.n_cols();
00234     num_dims = data.n_rows();
00235     
00236     centroids.Init(num_clusters);
00237     tmp_centroids.Init(num_clusters);
00238     
00239     counts.Init(num_clusters);
00240     labels.Init(num_points);
00241     
00242     
00243     for (i=0, j=0; j < num_clusters; i+=num_points/num_clusters, j++) {
00244       Vector temp_vector;
00245       data.MakeColumnVector(i, &temp_vector);
00246       centroids[j].Copy(temp_vector);
00247       tmp_centroids[j].Init(num_dims);
00248     }
00249     
00250     error = DBL_MAX;
00251     
00252     do {
00253       
00254       old_error = error; error = 0;
00255 
00256       for (i=0; i < num_clusters; i++) {
00257         tmp_centroids[i].SetZero();
00258         counts[i] = 0;
00259       }
00260       
00261      for (i=0; i<num_points; i++) {
00262        
00263        
00264        double min_distance = DBL_MAX;
00265        Vector data_i_Vec;
00266        data.MakeColumnVector(i, &data_i_Vec);
00267        
00268        for (j=0; j<num_clusters; j++ ) {
00269          double distance = la::DistanceSqEuclidean(data_i_Vec, centroids[j]);
00270          if (distance < min_distance) {
00271            labels[i] = j;
00272            min_distance = distance;
00273          }
00274        }
00275        
00276        
00277        la::AddTo(data_i_Vec, &(tmp_centroids[labels[i]]));
00278        counts[labels[i]]++;
00279        error += min_distance;
00280      }
00281      
00282      
00283      for (int j=0; j < num_clusters; j++) {
00284        if (counts[j] > 0)
00285          la::ScaleOverwrite((1/(double)counts[j]), 
00286                             tmp_centroids[j], &(centroids[j]));
00287      }
00288      num_iter++;
00289      
00290     } while ((fabs(error - old_error) > error_thresh)
00291              && (num_iter < max_iter));
00292     
00293     return true;
00294 
00295   }
00296   
00297   void mat2arrlst(Matrix& a, ArrayList<Vector> * seqs) {
00298     int n = a.n_cols();
00299     ArrayList<Vector> & s_ = *seqs;
00300     s_.Init();
00301     for (int i = 0; i < n; i++) {
00302       Vector seq;
00303       a.MakeColumnVector(i, &seq);
00304       s_.PushBackCopy(seq);
00305     }
00306   }
00307 
00308   void mat2arrlstmat(int N, Matrix& a, ArrayList<Matrix> * seqs) {
00309     int n = a.n_cols();
00310     ArrayList<Matrix>& s_ = *seqs;
00311     s_.Init();
00312     for (int i = 0; i < n; i+=N) {
00313       Matrix b;
00314       a.MakeColumnSlice(i, N, &b);
00315       s_.PushBackCopy(b);
00316     }
00317   }
00318 
00319   bool skip_blank(TextLineReader& reader) {
00320     for (;;){
00321       if (!reader.MoreLines()) return false;
00322       char* pos = reader.Peek().begin();
00323       while (*pos == ' ' || *pos == ',' || *pos == '\t')
00324         pos++;
00325       if (*pos == '\0' || *pos == '%') reader.Gobble();
00326       else break;
00327     }
00328     return true;
00329   }
00330 
00331  success_t read_matrix(TextLineReader& reader, Matrix* matrix) {
00332    if (!skip_blank(reader)) { 
00333      matrix->Init(0,0);
00334      return SUCCESS_FAIL;
00335    }
00336    else {
00337      int n_rows = 0;
00338      int n_cols = 0;
00339      bool is_done;
00340      {
00341        ArrayList<String> num_str;
00342        num_str.Init();
00343        reader.Peek().Split(", \t", &num_str);
00344        n_cols = num_str.size();
00345      }
00346      ArrayList<double> num_double;
00347      num_double.Init();
00348 
00349      for(;;) { 
00350        n_rows++;
00351        
00352        ArrayList<String> num_str;
00353        num_str.Init();
00354        reader.Peek().Split(", \t", &num_str);
00355 
00356        DEBUG_ASSERT(num_str.size() == n_cols);
00357 
00358        for (int i = 0; i < n_cols; i++)
00359          num_double.PushBackCopy(strtod(num_str[i], NULL));
00360 
00361        is_done = false;
00362 
00363        reader.Gobble();
00364 
00365        for (;;){
00366          if (!reader.MoreLines()) {
00367            is_done = true;
00368            break;
00369          }
00370          char* pos = reader.Peek().begin();
00371          while (*pos == ' ' || *pos == '\t')
00372            pos++;
00373          if (*pos == '\0') reader.Gobble();
00374          else if (*pos == '%') {
00375            is_done = true;
00376            break;
00377          }
00378          else break;
00379        }
00380 
00381        if (is_done) {
00382          num_double.Trim();
00383          matrix->Own(num_double.ReleasePtr(), n_cols, n_rows);
00384          return SUCCESS_PASS;
00385        }
00386      }
00387    }
00388  }
00389 
00390   success_t read_vector(TextLineReader& reader, Vector* vec) {
00391     if (!skip_blank(reader)) { 
00392       vec->Init(0);
00393       return SUCCESS_FAIL;
00394     }
00395     else {
00396       ArrayList<double> num_double;
00397       num_double.Init();
00398 
00399       for(;;) { 
00400         bool is_done = false;
00401 
00402         ArrayList<String> num_str;
00403         num_str.Init();
00404         reader.Peek().Split(", \t", &num_str);
00405 
00406         
00407 
00408         for (int i = 0; i < num_str.size(); i++)
00409           num_double.PushBackCopy(strtod(num_str[i], NULL));
00410 
00411         reader.Gobble();
00412 
00413         for (;;){
00414           if (!reader.MoreLines()) {
00415             is_done = true;
00416             break;
00417           }
00418           char* pos = reader.Peek().begin();
00419           while (*pos == ' ' || *pos == '\t')
00420             pos++;
00421           if (*pos == '\0') reader.Gobble();
00422           else if (*pos == '%') {
00423             is_done = true;
00424             break;
00425           }
00426           else break;
00427         }
00428 
00429         if (is_done) {
00430           num_double.Trim();
00431           int length = num_double.size();
00432           vec->Own(num_double.ReleasePtr(), length);
00433           return SUCCESS_PASS;
00434         }
00435       }
00436     }
00437   }
00438 
00439   success_t load_matrix_list(const char* filename, ArrayList<Matrix> *matlst) {
00440     TextLineReader reader;
00441     matlst->Init();
00442     if (!PASSED(reader.Open(filename))) return SUCCESS_FAIL;
00443     do {
00444       Matrix tmp;
00445       if (read_matrix(reader, &tmp) == SUCCESS_PASS) {
00446         matlst->PushBackCopy(tmp);
00447       }
00448       else break;
00449     } while (1);
00450     return SUCCESS_PASS;
00451   }
00452 
00453   success_t load_vector_list(const char* filename, ArrayList<Vector> *veclst) {
00454     TextLineReader reader;
00455     veclst->Init();
00456     if (!PASSED(reader.Open(filename))) return SUCCESS_FAIL;
00457     do {
00458       Vector vec;
00459       if (read_vector(reader, &vec) == SUCCESS_PASS) {
00460         veclst->PushBackCopy(vec);
00461       }
00462       else break;
00463     } while (1);
00464     return SUCCESS_PASS;
00465   }
00466 }