00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032 #include "fastlib/fastlib.h"
00033 #include "support.h"
00034
00035 namespace hmm_support {
00036
00037 double RAND_UNIFORM_01() {
00038 return (double) rand() / (double)RAND_MAX;
00039 }
00040
00041 double RAND_UNIFORM(double a, double b) {
00042 return RAND_UNIFORM_01() * (b-a) + a;
00043 }
00044
00045 void print_matrix(const Matrix& a, const char* msg) {
00046 printf("%s - Matrix (%d x %d) = \n", msg, a.n_rows(), a.n_cols());
00047 for (int i = 0; i < a.n_rows(); i++) {
00048 for (int j = 0; j < a.n_cols(); j++)
00049 printf("%8.4f", a.get(i, j));
00050 printf("\n");
00051 }
00052 }
00053
00054 void print_matrix(TextWriter& writer, const Matrix& a, const char* msg, const char* format) {
00055 writer.Printf("%s - Matrix (%d x %d) = \n", msg, a.n_rows(), a.n_cols());
00056 for (int j = 0; j < a.n_cols(); j++) {
00057 for (int i = 0; i < a.n_rows(); i++)
00058 writer.Printf(format, a.get(i, j));
00059 writer.Printf("\n");
00060 }
00061 }
00062
00063 void print_vector(const Vector& a, const char* msg) {
00064 printf("%s - Vector (%d) = \n", msg, a.length());
00065 for (int i = 0; i < a.length(); i++)
00066 printf("%8.4f", a[i]);
00067 printf("\n");
00068 }
00069
00070 void print_vector(TextWriter& writer, const Vector& a, const char* msg, const char* format) {
00071 writer.Printf("%s - Vector (%d) = \n", msg, a.length());
00072 for (int i = 0; i < a.length(); i++)
00073 writer.Printf(format, a[i]);
00074 writer.Printf("\n");
00075 }
00076
00077 double RAND_NORMAL_01() {
00078 double r = 2, u, v;
00079 while (r > 1) {
00080 u = RAND_UNIFORM(-1, 1);
00081 v = RAND_UNIFORM(-1, 1);
00082 r = u*u+v*v;
00083 }
00084 return sqrt(-2*log(r)/r)*u;
00085 }
00086
00087 void RAND_NORMAL_01_INIT(int N, Vector* v) {
00088 double r, u, t;
00089 Vector& v_ = *v;
00090 v_.Init(N);
00091 for (int i = 0; i < N; i+=2) {
00092 r = 2;
00093 while (r > 1) {
00094 u = RAND_UNIFORM(-1, 1);
00095 t = RAND_UNIFORM(-1, 1);
00096 r = u*u+t*t;
00097 }
00098 v_[i] = sqrt(-2*log(r)/r)*u;
00099 if (i+1 < N) v_[i+1] = sqrt(-2*log(r)/r)*t;
00100 }
00101 }
00102
00103 void RAND_NORMAL_INIT(const Vector& mean, const Matrix& cov, Vector* v) {
00104 int N = mean.length();
00105 Vector v01;
00106 RAND_NORMAL_01_INIT(N, &v01);
00107 la::MulInit(cov, v01, v);
00108 la::AddTo(mean, v);
00109 }
00110
00111
00112 double MyMulExpert(const Vector& x, const Matrix& A, const Vector& y) {
00113 int M = A.n_rows();
00114 int N = A.n_cols();
00115 DEBUG_ASSERT_MSG((M==x.length() && N==y.length()), "MyMulExpert: sizes do not match");
00116
00117 double s = 0;
00118 for (int i = 0; i < M; i++)
00119 for (int j = 0; j < N; j++)
00120 s += x[i] * A.get(i, j) * y[j];
00121 return s;
00122 }
00123
00124 double NORMAL_DENSITY(const Vector& x, const Vector& mean, const Matrix& inv_cov, double det_cov) {
00125 Vector d;
00126 la::SubInit(x, mean, &d);
00127 return det_cov * exp(-0.5*MyMulExpert(d, inv_cov, d));
00128 }
00129
00130 bool kmeans(const ArrayList<Matrix>& data, int num_clusters,
00131 ArrayList<int> *labels_, ArrayList<Vector> *centroids_,
00132 int max_iter, double error_thresh)
00133 {
00134 ArrayList<int> counts;
00135 ArrayList<Vector> tmp_centroids;
00136 int num_points, num_dims;
00137 int i, j, num_iter=0;
00138 double error, old_error;
00139
00140
00141 ArrayList<int> &labels = *labels_;
00142 ArrayList<Vector> ¢roids = *centroids_;
00143
00144 num_points = 0;
00145 for (int i = 0; i < data.size(); i++)
00146 num_points+=data[i].n_cols();
00147 if (num_points < num_clusters)
00148 return false;
00149
00150 num_dims = data[0].n_rows();
00151
00152 centroids.Init(num_clusters);
00153 tmp_centroids.Init(num_clusters);
00154
00155 counts.Init(num_clusters);
00156 labels.Init(num_points);
00157
00158
00159 for (j=0; j < num_clusters; j++) {
00160 Vector temp_vector;
00161 int i = (int) math::Random(0, data.size() - 0.5);
00162 int k = (int) math::Random(0, data[i].n_cols()-0.5);
00163 data[i].MakeColumnVector(k, &temp_vector);
00164 centroids[j].Copy(temp_vector);
00165 tmp_centroids[j].Init(num_dims);
00166 }
00167
00168 error = DBL_MAX;
00169
00170 do {
00171
00172 old_error = error; error = 0;
00173
00174 for (i=0; i < num_clusters; i++) {
00175 tmp_centroids[i].SetZero();
00176 counts[i] = 0;
00177 }
00178 i = 0;
00179 for (int t=0, i = 0; t<data.size(); t++)
00180 for (int k = 0; k < data[t].n_cols(); k++, i++) {
00181
00182
00183 double min_distance = DBL_MAX;
00184 Vector data_i_Vec;
00185 data[t].MakeColumnVector(k, &data_i_Vec);
00186
00187 for (j=0; j<num_clusters; j++ ) {
00188 double distance = la::DistanceSqEuclidean(data_i_Vec, centroids[j]);
00189 if (distance < min_distance) {
00190 labels[i] = j;
00191 min_distance = distance;
00192 }
00193 }
00194
00195
00196 la::AddTo(data_i_Vec, &(tmp_centroids[labels[i]]));
00197 counts[labels[i]]++;
00198 error += min_distance;
00199 }
00200
00201
00202 for (int j=0; j < num_clusters; j++) {
00203 if (counts[j] > 0)
00204 la::ScaleOverwrite((1/(double)counts[j]),
00205 tmp_centroids[j], &(centroids[j]));
00206 }
00207 num_iter++;
00208
00209 } while ((fabs(error - old_error) > error_thresh)
00210 && (num_iter < max_iter));
00211
00212 return true;
00213
00214 }
00215
00216 bool kmeans(Matrix const &data, int num_clusters,
00217 ArrayList<int> *labels_, ArrayList<Vector> *centroids_,
00218 int max_iter, double error_thresh)
00219 {
00220 ArrayList<int> counts;
00221 ArrayList<Vector> tmp_centroids;
00222 int num_points, num_dims;
00223 int i, j, num_iter=0;
00224 double error, old_error;
00225
00226
00227 ArrayList<int> &labels = *labels_;
00228 ArrayList<Vector> ¢roids = *centroids_;
00229
00230 if (data.n_cols() < num_clusters)
00231 return false;
00232
00233 num_points = data.n_cols();
00234 num_dims = data.n_rows();
00235
00236 centroids.Init(num_clusters);
00237 tmp_centroids.Init(num_clusters);
00238
00239 counts.Init(num_clusters);
00240 labels.Init(num_points);
00241
00242
00243 for (i=0, j=0; j < num_clusters; i+=num_points/num_clusters, j++) {
00244 Vector temp_vector;
00245 data.MakeColumnVector(i, &temp_vector);
00246 centroids[j].Copy(temp_vector);
00247 tmp_centroids[j].Init(num_dims);
00248 }
00249
00250 error = DBL_MAX;
00251
00252 do {
00253
00254 old_error = error; error = 0;
00255
00256 for (i=0; i < num_clusters; i++) {
00257 tmp_centroids[i].SetZero();
00258 counts[i] = 0;
00259 }
00260
00261 for (i=0; i<num_points; i++) {
00262
00263
00264 double min_distance = DBL_MAX;
00265 Vector data_i_Vec;
00266 data.MakeColumnVector(i, &data_i_Vec);
00267
00268 for (j=0; j<num_clusters; j++ ) {
00269 double distance = la::DistanceSqEuclidean(data_i_Vec, centroids[j]);
00270 if (distance < min_distance) {
00271 labels[i] = j;
00272 min_distance = distance;
00273 }
00274 }
00275
00276
00277 la::AddTo(data_i_Vec, &(tmp_centroids[labels[i]]));
00278 counts[labels[i]]++;
00279 error += min_distance;
00280 }
00281
00282
00283 for (int j=0; j < num_clusters; j++) {
00284 if (counts[j] > 0)
00285 la::ScaleOverwrite((1/(double)counts[j]),
00286 tmp_centroids[j], &(centroids[j]));
00287 }
00288 num_iter++;
00289
00290 } while ((fabs(error - old_error) > error_thresh)
00291 && (num_iter < max_iter));
00292
00293 return true;
00294
00295 }
00296
00297 void mat2arrlst(Matrix& a, ArrayList<Vector> * seqs) {
00298 int n = a.n_cols();
00299 ArrayList<Vector> & s_ = *seqs;
00300 s_.Init();
00301 for (int i = 0; i < n; i++) {
00302 Vector seq;
00303 a.MakeColumnVector(i, &seq);
00304 s_.PushBackCopy(seq);
00305 }
00306 }
00307
00308 void mat2arrlstmat(int N, Matrix& a, ArrayList<Matrix> * seqs) {
00309 int n = a.n_cols();
00310 ArrayList<Matrix>& s_ = *seqs;
00311 s_.Init();
00312 for (int i = 0; i < n; i+=N) {
00313 Matrix b;
00314 a.MakeColumnSlice(i, N, &b);
00315 s_.PushBackCopy(b);
00316 }
00317 }
00318
00319 bool skip_blank(TextLineReader& reader) {
00320 for (;;){
00321 if (!reader.MoreLines()) return false;
00322 char* pos = reader.Peek().begin();
00323 while (*pos == ' ' || *pos == ',' || *pos == '\t')
00324 pos++;
00325 if (*pos == '\0' || *pos == '%') reader.Gobble();
00326 else break;
00327 }
00328 return true;
00329 }
00330
00331 success_t read_matrix(TextLineReader& reader, Matrix* matrix) {
00332 if (!skip_blank(reader)) {
00333 matrix->Init(0,0);
00334 return SUCCESS_FAIL;
00335 }
00336 else {
00337 int n_rows = 0;
00338 int n_cols = 0;
00339 bool is_done;
00340 {
00341 ArrayList<String> num_str;
00342 num_str.Init();
00343 reader.Peek().Split(", \t", &num_str);
00344 n_cols = num_str.size();
00345 }
00346 ArrayList<double> num_double;
00347 num_double.Init();
00348
00349 for(;;) {
00350 n_rows++;
00351
00352 ArrayList<String> num_str;
00353 num_str.Init();
00354 reader.Peek().Split(", \t", &num_str);
00355
00356 DEBUG_ASSERT(num_str.size() == n_cols);
00357
00358 for (int i = 0; i < n_cols; i++)
00359 num_double.PushBackCopy(strtod(num_str[i], NULL));
00360
00361 is_done = false;
00362
00363 reader.Gobble();
00364
00365 for (;;){
00366 if (!reader.MoreLines()) {
00367 is_done = true;
00368 break;
00369 }
00370 char* pos = reader.Peek().begin();
00371 while (*pos == ' ' || *pos == '\t')
00372 pos++;
00373 if (*pos == '\0') reader.Gobble();
00374 else if (*pos == '%') {
00375 is_done = true;
00376 break;
00377 }
00378 else break;
00379 }
00380
00381 if (is_done) {
00382 num_double.Trim();
00383 matrix->Own(num_double.ReleasePtr(), n_cols, n_rows);
00384 return SUCCESS_PASS;
00385 }
00386 }
00387 }
00388 }
00389
00390 success_t read_vector(TextLineReader& reader, Vector* vec) {
00391 if (!skip_blank(reader)) {
00392 vec->Init(0);
00393 return SUCCESS_FAIL;
00394 }
00395 else {
00396 ArrayList<double> num_double;
00397 num_double.Init();
00398
00399 for(;;) {
00400 bool is_done = false;
00401
00402 ArrayList<String> num_str;
00403 num_str.Init();
00404 reader.Peek().Split(", \t", &num_str);
00405
00406
00407
00408 for (int i = 0; i < num_str.size(); i++)
00409 num_double.PushBackCopy(strtod(num_str[i], NULL));
00410
00411 reader.Gobble();
00412
00413 for (;;){
00414 if (!reader.MoreLines()) {
00415 is_done = true;
00416 break;
00417 }
00418 char* pos = reader.Peek().begin();
00419 while (*pos == ' ' || *pos == '\t')
00420 pos++;
00421 if (*pos == '\0') reader.Gobble();
00422 else if (*pos == '%') {
00423 is_done = true;
00424 break;
00425 }
00426 else break;
00427 }
00428
00429 if (is_done) {
00430 num_double.Trim();
00431 int length = num_double.size();
00432 vec->Own(num_double.ReleasePtr(), length);
00433 return SUCCESS_PASS;
00434 }
00435 }
00436 }
00437 }
00438
00439 success_t load_matrix_list(const char* filename, ArrayList<Matrix> *matlst) {
00440 TextLineReader reader;
00441 matlst->Init();
00442 if (!PASSED(reader.Open(filename))) return SUCCESS_FAIL;
00443 do {
00444 Matrix tmp;
00445 if (read_matrix(reader, &tmp) == SUCCESS_PASS) {
00446 matlst->PushBackCopy(tmp);
00447 }
00448 else break;
00449 } while (1);
00450 return SUCCESS_PASS;
00451 }
00452
00453 success_t load_vector_list(const char* filename, ArrayList<Vector> *veclst) {
00454 TextLineReader reader;
00455 veclst->Init();
00456 if (!PASSED(reader.Open(filename))) return SUCCESS_FAIL;
00457 do {
00458 Vector vec;
00459 if (read_vector(reader, &vec) == SUCCESS_PASS) {
00460 veclst->PushBackCopy(vec);
00461 }
00462 else break;
00463 } while (1);
00464 return SUCCESS_PASS;
00465 }
00466 }