00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00103 #ifndef ORIGINAL_IFGT_H
00104 #define ORIGINAL_IFGT_H
00105
00106 #include "fastlib/fastlib.h"
00107
00108 class OriginalIFGT {
00109
00110 private:
00111
00113
00115 struct datanode *module_;
00116
00118 int dim_;
00119
00121 int num_reference_points_;
00122
00124 Matrix query_set_;
00125
00127 Matrix reference_set_;
00128
00130 Vector reference_weights_;
00131
00133 double bandwidth_;
00134
00136 double bandwidth_sq_;
00137
00144 double bandwidth_factor_;
00145
00147 double epsilon_;
00148
00150 int pterms_;
00151
00153 int total_num_coeffs_;
00154
00156 Matrix weighted_coeffs_;
00157
00159 Matrix unweighted_coeffs_;
00160
00162 int num_cluster_desired_;
00163
00169 double cut_off_radius_;
00170
00173 double max_radius_cluster_;
00174
00177 Matrix cluster_centers_;
00178
00182 ArrayList<int> index_during_clustering_;
00183
00187 ArrayList<int> cluster_index_;
00188
00192 Vector cluster_radii_;
00193
00196 ArrayList<int> num_reference_points_in_cluster_;
00197
00200 Vector densities_;
00201
00203 void TaylorExpansion() {
00204
00205 Vector tmp_coeffs;
00206 tmp_coeffs.Init(total_num_coeffs_);
00207
00208 ComputeUnweightedCoeffs_(tmp_coeffs);
00209
00210 ComputeWeightedCoeffs_(tmp_coeffs);
00211
00212 return;
00213 }
00214
00215 void ComputeUnweightedCoeffs_(Vector &taylor_coeffs) {
00216
00217 ArrayList<int> heads;
00218 heads.Init(dim_ + 1);
00219 ArrayList<int> cinds;
00220 cinds.Init(total_num_coeffs_);
00221
00222 for (int i = 0; i < dim_; i++) {
00223 heads[i] = 0;
00224 }
00225 heads[dim_] = INT_MAX;
00226
00227 cinds[0] = 0;
00228 taylor_coeffs[0] = 1.0;
00229
00230 for(int k = 1, t = 1, tail = 1; k < pterms_; k++, tail = t) {
00231 for(int i = 0; i < dim_; i++) {
00232 int head = heads[i];
00233 heads[i] = t;
00234 for(int j = head; j < tail; j++, t++) {
00235 cinds[t] = (j < heads[i+1]) ? cinds[j] + 1 : 1;
00236 taylor_coeffs[t] = 2.0 * taylor_coeffs[j];
00237 taylor_coeffs[t] /= (double) cinds[t];
00238 }
00239 }
00240 }
00241 return;
00242 }
00243
00244 void ComputeWeightedCoeffs_(Vector &taylor_coeffs) {
00245
00246 Vector dx;
00247 dx.Init(dim_);
00248 Vector prods;
00249 prods.Init(total_num_coeffs_);
00250 ArrayList<int> heads;
00251 heads.Init(dim_);
00252
00253
00254 weighted_coeffs_.SetZero();
00255 unweighted_coeffs_.SetZero();
00256
00257 for(int n = 0; n < num_reference_points_; n++) {
00258
00259 int ix2c = cluster_index_[n];
00260 double sum = 0.0;
00261
00262 for(int i = 0; i < dim_; i++) {
00263 dx[i] = (reference_set_.get(i, n) - cluster_centers_.get(i, ix2c)) /
00264 bandwidth_factor_;
00265
00266 sum -= dx[i] * dx[i];
00267 heads[i] = 0;
00268 }
00269
00270 prods[0] = exp(sum);
00271 for(int k = 1, t = 1, tail = 1; k < pterms_; k++, tail = t) {
00272
00273 for (int i = 0; i < dim_; i++) {
00274 int head = heads[i];
00275 heads[i] = t;
00276 for(int j = head; j < tail; j++, t++)
00277 prods[t] = dx[i] * prods[j];
00278 }
00279 }
00280
00281
00282 for(int i = 0; i < total_num_coeffs_; i++) {
00283 weighted_coeffs_.set(i, ix2c, weighted_coeffs_.get(i, ix2c) +
00284 reference_weights_[n] * prods[i]);
00285 unweighted_coeffs_.set(i, ix2c, unweighted_coeffs_.get(i, ix2c) +
00286 prods[i]);
00287 }
00288
00289 }
00290
00291
00292 for(int k = 0; k < num_cluster_desired_; k++) {
00293 for(int i = 0; i < total_num_coeffs_; i++) {
00294 weighted_coeffs_.set(i, k, weighted_coeffs_.get(i, k) *
00295 taylor_coeffs[i]);
00296 unweighted_coeffs_.set(i, k, unweighted_coeffs_.get(i, k) *
00297 taylor_coeffs[i]);
00298 }
00299 }
00300 return;
00301 }
00302
00307 double ComputeCenters() {
00308
00309
00310 max_radius_cluster_ = 0;
00311
00312
00313 cluster_centers_.SetZero();
00314
00315
00316 for(int j = 0; j < dim_; j++) {
00317 for(int i = 0; i < num_reference_points_; i++) {
00318
00319 cluster_centers_.set(j, cluster_index_[i],
00320 cluster_centers_.get(j, cluster_index_[i]) +
00321 reference_set_.get(j, i));
00322 }
00323 }
00324
00325 for(int j = 0; j < dim_; j++) {
00326 for(int i = 0; i < num_cluster_desired_; i++) {
00327 cluster_centers_.set(j, i, cluster_centers_.get(j, i) /
00328 num_reference_points_in_cluster_[i]);
00329 }
00330 }
00331
00332
00333 cluster_radii_.SetZero();
00334 for(int i = 0; i < num_reference_points_; i++) {
00335 Vector reference_pt;
00336 reference_set_.MakeColumnVector(i, &reference_pt);
00337
00338
00339 int cluster_id = cluster_index_[i];
00340 Vector center;
00341 cluster_centers_.MakeColumnVector(cluster_id, ¢er);
00342 cluster_radii_[cluster_id] =
00343 std::max(cluster_radii_[cluster_id],
00344 sqrt(la::DistanceSqEuclidean(reference_pt, center)));
00345 max_radius_cluster_ =
00346 std::max(max_radius_cluster_, cluster_radii_[cluster_id]);
00347 }
00348
00349 return max_radius_cluster_;
00350 }
00351
00355 double KCenterClustering() {
00356
00357 Vector distances_to_center;
00358 distances_to_center.Init(num_reference_points_);
00359
00360
00361 srand( (unsigned)time( NULL ) );
00362 int ind = rand() % num_reference_points_;
00363
00364
00365 index_during_clustering_[0] = ind;
00366 Vector first_center;
00367 reference_set_.MakeColumnVector(ind, &first_center);
00368
00369
00370
00371
00372 for(int j = 0; j < num_reference_points_; j++) {
00373 Vector reference_point;
00374 reference_set_.MakeColumnVector(j, &reference_point);
00375
00376 distances_to_center[j] = (j == ind) ?
00377 0.0:la::DistanceSqEuclidean(reference_point, first_center);
00378 cluster_index_[j] = 0;
00379 }
00380
00381
00382 for(int i = 1; i < num_cluster_desired_; i++) {
00383
00384
00385
00386 ind = IndexOfLargestElement(distances_to_center);
00387
00388
00389 index_during_clustering_[i] = ind;
00390
00391
00392 Vector center;
00393 reference_set_.MakeColumnVector(ind, ¢er);
00394
00395 for (int j = 0; j < num_reference_points_; j++) {
00396 Vector reference_point;
00397 reference_set_.MakeColumnVector(j, &reference_point);
00398 double d = (j == ind)?
00399 0.0:la::DistanceSqEuclidean(reference_point, center);
00400
00401 if (d < distances_to_center[j]) {
00402 distances_to_center[j] = d;
00403 cluster_index_[j] = i;
00404 }
00405 }
00406 }
00407
00408
00409 ind = IndexOfLargestElement(distances_to_center);
00410
00411 double radius = distances_to_center[ind];
00412
00413
00414 for(int i = 0; i < num_cluster_desired_; i++) {
00415 num_reference_points_in_cluster_[i] = 0;
00416 }
00417
00418 for (int i = 0; i < num_reference_points_; i++) {
00419 num_reference_points_in_cluster_[cluster_index_[i]]++;
00420 }
00421
00422 return sqrt(radius);
00423 }
00424
00428 int IndexOfLargestElement(const Vector &x) {
00429
00430 int largest_index = 0;
00431 double largest_quantity = -DBL_MAX;
00432
00433 for(int i = 0; i < x.length(); i++) {
00434 if(largest_quantity < x[i]) {
00435 largest_quantity = x[i];
00436 largest_index = i;
00437 }
00438 }
00439 return largest_index;
00440 }
00441
00446 void NormalizeDensities() {
00447 double norm_const = pow(2 * math::PI * bandwidth_sq_,
00448 query_set_.n_rows() / 2.0) *
00449 reference_set_.n_cols();
00450
00451 for(index_t q = 0; q < query_set_.n_cols(); q++) {
00452 densities_[q] /= norm_const;
00453 }
00454 }
00455
00456 void IFGTChooseTruncationNumber_() {
00457
00458 double rx = max_radius_cluster_;
00459 double max_diameter_of_the_datasets = sqrt(dim_);
00460
00461 double two_h_square = bandwidth_factor_ * bandwidth_factor_;
00462
00463 double r = min(max_diameter_of_the_datasets,
00464 bandwidth_factor_ * sqrt(log(1 / epsilon_)));
00465
00466 int p_ul=300;
00467
00468 double rx_square = rx * rx;
00469
00470 double error = 1;
00471 double temp = 1;
00472 int p = 0;
00473 while((error > epsilon_) & (p <= p_ul)) {
00474 p++;
00475 double b = min(((rx + sqrt((rx_square) + (2 * p * two_h_square))) / 2),
00476 rx + r);
00477 double c = rx - b;
00478 temp = temp * (((2 * rx * b) / two_h_square) / p);
00479 error = temp * (exp(-(c * c) / two_h_square));
00480 }
00481
00482
00483 pterms_ = p;
00484
00485
00486 cut_off_radius_ = r;
00487
00488 }
00489
00490 void IFGTChooseParameters_(int max_num_clusters) {
00491
00492
00493
00494 double max_diamater_of_the_datasets = sqrt(dim_);
00495
00496 double two_h_square = bandwidth_factor_ * bandwidth_factor_;
00497
00498
00499 double r = min(max_diamater_of_the_datasets,
00500 bandwidth_factor_ * sqrt(log(1 / epsilon_)));
00501
00502
00503 int p_ul=200;
00504
00505 num_cluster_desired_ = 1;
00506
00507 double complexity_min=1e16;
00508 double rx;
00509
00510 for(int i = 0; i < max_num_clusters; i++){
00511
00512
00513 rx = pow((double) i + 1, -1.0 / (double) dim_);
00514 double rx_square = rx * rx;
00515
00516
00517 double n = std::min(i + 1.0, pow(r / rx, (double) dim_));
00518 double error = 1;
00519 double temp = 1;
00520 int p = 0;
00521
00522
00523 while((error > epsilon_) & (p <= p_ul)) {
00524 p++;
00525 double b =
00526 std::min(((rx + sqrt((rx_square) + (2 * p * two_h_square))) / 2.0),
00527 rx + r);
00528 double c = rx - b;
00529 temp = temp * (((2 * rx * b) / two_h_square) / p);
00530 error = temp * (exp(-(c * c) / two_h_square));
00531 }
00532 double complexity = (i + 1) + log((double) i + 1) +
00533 ((1 + n) * math::BinomialCoefficient(p - 1 + dim_, dim_));
00534
00535 if(complexity < complexity_min) {
00536 complexity_min = complexity;
00537 num_cluster_desired_ = i + 1;
00538 pterms_ = p;
00539 }
00540 }
00541 }
00542
00543 public:
00544
00546
00549 OriginalIFGT() {}
00550
00553 ~OriginalIFGT() {}
00554
00555
00557
00563 void get_density_estimates(Vector *results) {
00564 results->Init(densities_.length());
00565
00566 for(index_t i = 0; i < densities_.length(); i++) {
00567 (*results)[i] = densities_[i];
00568 }
00569 }
00570
00572
00573 void Init(Matrix &queries, Matrix &references, struct datanode *module_in) {
00574
00575
00576 module_ = module_in;
00577
00578
00579 dim_ = references.n_rows();
00580
00581
00582 query_set_.Copy(queries);
00583 reference_set_.Copy(references);
00584 num_reference_points_ = reference_set_.n_cols();
00585
00586
00587 reference_weights_.Init(reference_set_.n_cols());
00588 reference_weights_.SetAll(1);
00589
00590
00591 densities_.Init(query_set_.n_cols());
00592
00593
00594 bandwidth_ = fx_param_double_req(module_, "bandwidth");
00595 bandwidth_sq_ = bandwidth_ * bandwidth_;
00596 bandwidth_factor_ = sqrt(2) * bandwidth_;
00597
00598
00599 epsilon_ = fx_param_double(module_, "absolute_error", 0.1);
00600
00601
00602 int cluster_limit = (int) ceilf(20.0 * sqrt(dim_) / sqrt(bandwidth_));
00603
00604 printf("Automatic parameter selection phase...\n");
00605
00606 printf("Preprocessing phase for the original IFGT...\n");
00607
00608 fx_timer_start(module_, "ifgt_kde_preprocess");
00609 IFGTChooseParameters_(cluster_limit);
00610 printf("Chose %d clusters...\n", num_cluster_desired_);
00611 printf("Tentatively chose %d truncation order...\n", pterms_);
00612
00613
00614 cluster_centers_.Init(dim_, num_cluster_desired_);
00615 index_during_clustering_.Init(num_cluster_desired_);
00616 cluster_index_.Init(num_reference_points_);
00617 cluster_radii_.Init(num_cluster_desired_);
00618 num_reference_points_in_cluster_.Init(num_cluster_desired_);
00619
00620 printf("Now clustering...\n");
00621
00622
00623
00624 max_radius_cluster_ = KCenterClustering();
00625
00626
00627 ComputeCenters();
00628
00629
00630 IFGTChooseTruncationNumber_();
00631
00632 total_num_coeffs_ =
00633 (int) math::BinomialCoefficient(pterms_ + dim_ - 1, dim_);
00634 weighted_coeffs_.Init(total_num_coeffs_, num_cluster_desired_);
00635 unweighted_coeffs_.Init(total_num_coeffs_, num_cluster_desired_);
00636
00637 printf("Maximum radius generated in the cluster: %g...\n",
00638 max_radius_cluster_);
00639 printf("Truncation order updated to %d after clustering...\n",
00640 pterms_);
00641
00642
00643 printf("Now computing Taylor coefficients...\n");
00644 TaylorExpansion();
00645 printf("Taylor coefficient computation finished...\n");
00646 fx_timer_stop(module_, "ifgt_kde_preprocess");
00647 printf("Preprocessing step finished...\n");
00648 }
00649
00650 void Compute() {
00651
00652 printf("Starting the original IFGT-based KDE computation...\n");
00653
00654 fx_timer_start(module_, "original_ifgt_kde_compute");
00655
00656 Vector dy;
00657 dy.Init(dim_);
00658
00659 Vector tempy;
00660 tempy.Init(dim_);
00661
00662 Vector prods;
00663 prods.Init(total_num_coeffs_);
00664
00665 ArrayList<int> heads;
00666 heads.Init(dim_);
00667
00668
00669 densities_.SetZero();
00670
00671 for(int m = 0; m < query_set_.n_cols(); m++) {
00672
00673
00674 for(int kn = 0; kn < num_cluster_desired_; kn++) {
00675
00676 double sum2 = 0.0;
00677
00678
00679
00680 for (int i = 0; i < dim_; i++) {
00681 dy[i] = (query_set_.get(i, m) - cluster_centers_.get(i, kn)) /
00682 bandwidth_factor_;
00683 sum2 += dy[i] * dy[i];
00684 }
00685
00686
00687
00688 if (sum2 > (cut_off_radius_ + cluster_radii_[kn]) /
00689 (bandwidth_factor_ * bandwidth_factor_)) {
00690 continue;
00691 }
00692
00693 for(int i = 0; i < dim_; i++) {
00694 heads[i] = 0;
00695 }
00696
00697 prods[0] = exp(-sum2);
00698 for(int k = 1, t = 1, tail = 1; k < pterms_; k++, tail = t) {
00699 for (int i = 0; i < dim_; i++) {
00700 int head = heads[i];
00701 heads[i] = t;
00702 for(int j = head; j < tail; j++, t++)
00703 prods[t] = dy[i] * prods[j];
00704 }
00705 }
00706
00707 for(int i = 0; i < total_num_coeffs_; i++) {
00708 densities_[m] += weighted_coeffs_.get(i, kn) * prods[i];
00709 }
00710
00711 }
00712 }
00713
00714
00715 NormalizeDensities();
00716
00717 fx_timer_stop(module_, "original_ifgt_kde_compute");
00718 printf("Computation finished...\n");
00719 return;
00720 }
00721
00722 void PrintDebug() {
00723
00724 FILE *stream = stdout;
00725 const char *fname = NULL;
00726
00727 if((fname = fx_param_str(module_, "ifgt_kde_output", NULL)) != NULL) {
00728 stream = fopen(fname, "w+");
00729 }
00730 for(index_t q = 0; q < query_set_.n_cols(); q++) {
00731 fprintf(stream, "%g\n", densities_[q]);
00732 }
00733
00734 if(stream != stdout) {
00735 fclose(stream);
00736 }
00737 }
00738
00739 };
00740
00741 #endif