original_ifgt.h

Go to the documentation of this file.
00001 /* MLPACK 0.2
00002  *
00003  * Copyright (c) 2008, 2009 Alexander Gray,
00004  *                          Garry Boyer,
00005  *                          Ryan Riegel,
00006  *                          Nikolaos Vasiloglou,
00007  *                          Dongryeol Lee,
00008  *                          Chip Mappus, 
00009  *                          Nishant Mehta,
00010  *                          Hua Ouyang,
00011  *                          Parikshit Ram,
00012  *                          Long Tran,
00013  *                          Wee Chin Wong
00014  *
00015  * Copyright (c) 2008, 2009 Georgia Institute of Technology
00016  *
00017  * This program is free software; you can redistribute it and/or
00018  * modify it under the terms of the GNU General Public License as
00019  * published by the Free Software Foundation; either version 2 of the
00020  * License, or (at your option) any later version.
00021  *
00022  * This program is distributed in the hope that it will be useful, but
00023  * WITHOUT ANY WARRANTY; without even the implied warranty of
00024  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00025  * General Public License for more details.
00026  *
00027  * You should have received a copy of the GNU General Public License
00028  * along with this program; if not, write to the Free Software
00029  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
00030  * 02110-1301, USA.
00031  */
00103 #ifndef ORIGINAL_IFGT_H
00104 #define ORIGINAL_IFGT_H
00105 
00106 #include "fastlib/fastlib.h"
00107 
00108 class OriginalIFGT {
00109 
00110  private:
00111 
00113 
00115   struct datanode *module_;
00116 
00118   int dim_;
00119 
00121   int num_reference_points_;
00122 
00124   Matrix query_set_;
00125 
00127   Matrix reference_set_;
00128 
00130   Vector reference_weights_;
00131 
00133   double bandwidth_;
00134 
00136   double bandwidth_sq_;
00137 
00144   double bandwidth_factor_;
00145 
00147   double epsilon_;
00148 
00150   int pterms_;
00151 
00153   int total_num_coeffs_;
00154   
00156   Matrix weighted_coeffs_;
00157 
00159   Matrix unweighted_coeffs_;
00160 
00162   int num_cluster_desired_;
00163 
00169   double cut_off_radius_;
00170 
00173   double max_radius_cluster_;
00174 
00177   Matrix cluster_centers_;
00178 
00182   ArrayList<int> index_during_clustering_;
00183 
00187   ArrayList<int> cluster_index_;
00188 
00192   Vector cluster_radii_;
00193 
00196   ArrayList<int> num_reference_points_in_cluster_;
00197 
00200   Vector densities_;
00201 
00203   void TaylorExpansion() {
00204       
00205     Vector tmp_coeffs;
00206     tmp_coeffs.Init(total_num_coeffs_);
00207     
00208     ComputeUnweightedCoeffs_(tmp_coeffs);
00209     
00210     ComputeWeightedCoeffs_(tmp_coeffs);
00211     
00212     return;     
00213   }
00214 
00215   void ComputeUnweightedCoeffs_(Vector &taylor_coeffs) {
00216         
00217     ArrayList<int> heads;
00218     heads.Init(dim_ + 1);
00219     ArrayList<int> cinds;
00220     cinds.Init(total_num_coeffs_);
00221     
00222     for (int i = 0; i < dim_; i++) {
00223       heads[i] = 0;
00224     }
00225     heads[dim_] = INT_MAX;
00226     
00227     cinds[0] = 0;
00228     taylor_coeffs[0] = 1.0;
00229     
00230     for(int k = 1, t = 1, tail = 1; k < pterms_; k++, tail = t) {
00231       for(int i = 0; i < dim_; i++) {
00232         int head = heads[i];
00233         heads[i] = t;
00234         for(int j = head; j < tail; j++, t++) {
00235           cinds[t] = (j < heads[i+1]) ? cinds[j] + 1 : 1;
00236           taylor_coeffs[t] = 2.0 * taylor_coeffs[j];
00237           taylor_coeffs[t] /= (double) cinds[t];
00238         }
00239       }
00240     }
00241     return;    
00242   }
00243 
00244   void ComputeWeightedCoeffs_(Vector &taylor_coeffs) {
00245     
00246     Vector dx;
00247     dx.Init(dim_);
00248     Vector prods;
00249     prods.Init(total_num_coeffs_);
00250     ArrayList<int> heads;
00251     heads.Init(dim_);
00252     
00253     // initialize coefficients for all clusters to be zero.
00254     weighted_coeffs_.SetZero();
00255     unweighted_coeffs_.SetZero();
00256     
00257     for(int n = 0; n < num_reference_points_; n++) {
00258       
00259       int ix2c = cluster_index_[n];
00260       double sum = 0.0;
00261       
00262       for(int i = 0; i < dim_; i++) {
00263         dx[i] = (reference_set_.get(i, n) - cluster_centers_.get(i, ix2c)) / 
00264           bandwidth_factor_;
00265 
00266         sum -= dx[i] * dx[i];
00267         heads[i] = 0;
00268       }
00269       
00270       prods[0] = exp(sum);
00271       for(int k = 1, t = 1, tail = 1; k < pterms_; k++, tail = t) {
00272         
00273         for (int i = 0; i < dim_; i++) {
00274           int head = heads[i];
00275           heads[i] = t;
00276           for(int j = head; j < tail; j++, t++)
00277             prods[t] = dx[i] * prods[j];
00278         } // for i
00279       } // for k
00280       
00281       // compute the weighted coefficients and unweighted coefficients.
00282       for(int i = 0; i < total_num_coeffs_; i++) {
00283         weighted_coeffs_.set(i, ix2c, weighted_coeffs_.get(i, ix2c) +
00284                              reference_weights_[n] * prods[i]);
00285         unweighted_coeffs_.set(i, ix2c, unweighted_coeffs_.get(i, ix2c) +
00286                                prods[i]);
00287       }
00288       
00289     }// for n
00290     
00291     // normalize by the Taylor coefficients.
00292     for(int k = 0; k < num_cluster_desired_; k++) {
00293       for(int i = 0; i < total_num_coeffs_; i++) {
00294         weighted_coeffs_.set(i, k, weighted_coeffs_.get(i, k) * 
00295                              taylor_coeffs[i]);
00296         unweighted_coeffs_.set(i, k, unweighted_coeffs_.get(i, k) * 
00297                                taylor_coeffs[i]);
00298       }
00299     }
00300     return;
00301   }
00302 
00307   double ComputeCenters() {
00308 
00309     // set cluster max radius to zero
00310     max_radius_cluster_ = 0;
00311 
00312     // clear all centers.
00313     cluster_centers_.SetZero();
00314     
00315     // Compute the weighted centroid for each cluster.
00316     for(int j = 0; j < dim_; j++) {
00317       for(int i = 0; i < num_reference_points_; i++) {
00318         
00319         cluster_centers_.set(j, cluster_index_[i],
00320                              cluster_centers_.get(j, cluster_index_[i]) +
00321                              reference_set_.get(j, i));
00322       }
00323     }
00324     
00325     for(int j = 0; j < dim_; j++) {
00326       for(int i = 0; i < num_cluster_desired_; i++) {
00327         cluster_centers_.set(j, i, cluster_centers_.get(j, i) /
00328                              num_reference_points_in_cluster_[i]);
00329       }
00330     }
00331     
00332     // Now loop through and compute the radius of each cluster.
00333     cluster_radii_.SetZero();
00334     for(int i = 0; i < num_reference_points_; i++) {
00335       Vector reference_pt;
00336       reference_set_.MakeColumnVector(i, &reference_pt);
00337 
00338       // the index of the cluster this reference point belongs to.
00339       int cluster_id = cluster_index_[i];
00340       Vector center;
00341       cluster_centers_.MakeColumnVector(cluster_id, &center);
00342       cluster_radii_[cluster_id] = 
00343         std::max(cluster_radii_[cluster_id], 
00344                  sqrt(la::DistanceSqEuclidean(reference_pt, center)));
00345       max_radius_cluster_ =
00346         std::max(max_radius_cluster_, cluster_radii_[cluster_id]);
00347     }
00348 
00349     return max_radius_cluster_;
00350   }
00351 
00355   double KCenterClustering() {
00356     
00357     Vector distances_to_center;
00358     distances_to_center.Init(num_reference_points_);
00359     
00360     // randomly pick one node as the first center.
00361     srand( (unsigned)time( NULL ) );
00362     int ind = rand() % num_reference_points_;
00363     
00364     // add the ind-th node to the first center.
00365     index_during_clustering_[0] = ind;
00366     Vector first_center;
00367     reference_set_.MakeColumnVector(ind, &first_center);
00368     
00369     // compute the distances from each node to the first center and
00370     // initialize the index of the cluster ID to zero for all
00371     // reference points.
00372     for(int j = 0; j < num_reference_points_; j++) {
00373       Vector reference_point;
00374       reference_set_.MakeColumnVector(j, &reference_point);
00375       
00376       distances_to_center[j] = (j == ind) ? 
00377         0.0:la::DistanceSqEuclidean(reference_point, first_center);
00378       cluster_index_[j] = 0;
00379     }
00380     
00381     // repeat until the desired number of clusters is reached.
00382     for(int i = 1; i < num_cluster_desired_; i++) {
00383       
00384       // Find the reference point that is farthest away from the
00385       // current center.
00386       ind = IndexOfLargestElement(distances_to_center);
00387       
00388       // Add the ind-th node to the centroid list.
00389       index_during_clustering_[i] = ind;
00390       
00391       // Update the distances from each point to the current center.
00392       Vector center;
00393       reference_set_.MakeColumnVector(ind, &center);
00394       
00395       for (int j = 0; j < num_reference_points_; j++) {
00396         Vector reference_point;
00397         reference_set_.MakeColumnVector(j, &reference_point);
00398         double d = (j == ind)? 
00399           0.0:la::DistanceSqEuclidean(reference_point, center);
00400         
00401         if (d < distances_to_center[j]) {
00402           distances_to_center[j] = d;
00403           cluster_index_[j] = i;
00404         }
00405       }
00406     }
00407     
00408     // Find the maximum radius of the k-center algorithm.
00409     ind = IndexOfLargestElement(distances_to_center);
00410     
00411     double radius = distances_to_center[ind];
00412     
00413     
00414     for(int i = 0; i < num_cluster_desired_; i++) {
00415       num_reference_points_in_cluster_[i] = 0;
00416     }
00417     // tally up the number of reference points for each cluster.
00418     for (int i = 0; i < num_reference_points_; i++) {
00419       num_reference_points_in_cluster_[cluster_index_[i]]++;
00420     }
00421     
00422     return sqrt(radius);  
00423   }
00424 
00428   int IndexOfLargestElement(const Vector &x) {
00429     
00430     int largest_index = 0;
00431     double largest_quantity = -DBL_MAX;
00432     
00433     for(int i = 0; i < x.length(); i++) {
00434       if(largest_quantity < x[i]) {
00435         largest_quantity = x[i];
00436         largest_index = i;
00437       }
00438     }
00439     return largest_index;
00440   }
00441 
00446   void NormalizeDensities() {
00447     double norm_const = pow(2 * math::PI * bandwidth_sq_, 
00448                             query_set_.n_rows() / 2.0) *
00449       reference_set_.n_cols();
00450 
00451     for(index_t q = 0; q < query_set_.n_cols(); q++) {
00452       densities_[q] /= norm_const;
00453     }
00454   }
00455 
00456   void IFGTChooseTruncationNumber_() {  
00457 
00458     double rx = max_radius_cluster_;
00459     double max_diameter_of_the_datasets = sqrt(dim_);
00460     
00461     double two_h_square = bandwidth_factor_ * bandwidth_factor_;
00462     
00463     double r = min(max_diameter_of_the_datasets, 
00464                    bandwidth_factor_ * sqrt(log(1 / epsilon_)));
00465     
00466     int p_ul=300;
00467     
00468     double rx_square = rx * rx;
00469     
00470     double error = 1;
00471     double temp = 1;
00472     int p = 0;
00473     while((error > epsilon_) & (p <= p_ul)) {
00474       p++;
00475       double b = min(((rx + sqrt((rx_square) + (2 * p * two_h_square))) / 2),
00476                      rx + r);
00477       double c = rx - b;
00478       temp = temp * (((2 * rx * b) / two_h_square) / p);
00479       error = temp * (exp(-(c * c) / two_h_square));                    
00480     }   
00481     
00482     // update the truncation order.
00483     pterms_ = p;
00484 
00485     // update the cut-off radius
00486     cut_off_radius_ = r;
00487     
00488   }
00489   
00490   void IFGTChooseParameters_(int max_num_clusters) {
00491     
00492     // for references and queries that fit in the unit hypercube, this
00493     // assumption is true, but for general case it is not.
00494     double max_diamater_of_the_datasets = sqrt(dim_);
00495     
00496     double two_h_square = bandwidth_factor_ * bandwidth_factor_;
00497 
00498     // The cut-off radius.
00499     double r = min(max_diamater_of_the_datasets, 
00500                    bandwidth_factor_ * sqrt(log(1 / epsilon_)));
00501     
00502     // Upper limit on the truncation number.
00503     int p_ul=200; 
00504     
00505     num_cluster_desired_ = 1;
00506     
00507     double complexity_min=1e16;
00508     double rx;
00509 
00510     for(int i = 0; i < max_num_clusters; i++){
00511      
00512       // Compute an estimate of the maximum cluster radius.
00513       rx = pow((double) i + 1, -1.0 / (double) dim_);
00514       double rx_square = rx * rx;
00515 
00516       // An estimate of the number of neighbors.
00517       double n = std::min(i + 1.0, pow(r / rx, (double) dim_));
00518       double error = 1;
00519       double temp = 1;
00520       int p = 0;
00521 
00522       // Choose the truncation order.
00523       while((error > epsilon_) & (p <= p_ul)) {
00524         p++;
00525         double b = 
00526           std::min(((rx + sqrt((rx_square) + (2 * p * two_h_square))) / 2.0),
00527                    rx + r);
00528         double c = rx - b;
00529         temp = temp * (((2 * rx * b) / two_h_square) / p);
00530         error = temp * (exp(-(c * c) / two_h_square));
00531       }
00532       double complexity = (i + 1) + log((double) i + 1) + 
00533         ((1 + n) * math::BinomialCoefficient(p - 1 + dim_, dim_));
00534         
00535       if(complexity < complexity_min) {
00536         complexity_min = complexity;
00537         num_cluster_desired_ = i + 1;
00538         pterms_ = p;
00539       }
00540     }    
00541   }
00542 
00543  public:
00544 
00546 
00549   OriginalIFGT() {}
00550 
00553   ~OriginalIFGT() {}
00554 
00555   
00557 
00563   void get_density_estimates(Vector *results) {
00564     results->Init(densities_.length());
00565 
00566     for(index_t i = 0; i < densities_.length(); i++) {
00567       (*results)[i] = densities_[i];
00568     }
00569   }
00570 
00572 
00573   void Init(Matrix &queries, Matrix &references, struct datanode *module_in) {
00574     
00575     // set module to the incoming one.
00576     module_ = module_in;
00577     
00578     // set dimensionality
00579     dim_ = references.n_rows();
00580         
00581     // set up query set and reference set.
00582     query_set_.Copy(queries);
00583     reference_set_.Copy(references);
00584     num_reference_points_ = reference_set_.n_cols();
00585     
00586     // By default, the we do uniform weights only
00587     reference_weights_.Init(reference_set_.n_cols());
00588     reference_weights_.SetAll(1);
00589 
00590     // initialize density estimate vector
00591     densities_.Init(query_set_.n_cols());
00592     
00593     // A "hack" such that the code uses the proper Gaussian kernel.
00594     bandwidth_ = fx_param_double_req(module_, "bandwidth");
00595     bandwidth_sq_ = bandwidth_ * bandwidth_;
00596     bandwidth_factor_ = sqrt(2) * bandwidth_;
00597     
00598     // Read in the desired absolute error accuracy.
00599     epsilon_ = fx_param_double(module_, "absolute_error", 0.1);
00600 
00601     // This is the upper limit on the number of clusters.
00602     int cluster_limit = (int) ceilf(20.0 * sqrt(dim_) / sqrt(bandwidth_));
00603     
00604     printf("Automatic parameter selection phase...\n");
00605 
00606     printf("Preprocessing phase for the original IFGT...\n");
00607 
00608     fx_timer_start(module_, "ifgt_kde_preprocess");
00609     IFGTChooseParameters_(cluster_limit);
00610     printf("Chose %d clusters...\n", num_cluster_desired_);
00611     printf("Tentatively chose %d truncation order...\n", pterms_);
00612 
00613     // Allocate spaces for storing coefficients and clustering information.
00614     cluster_centers_.Init(dim_, num_cluster_desired_);
00615     index_during_clustering_.Init(num_cluster_desired_);
00616     cluster_index_.Init(num_reference_points_);
00617     cluster_radii_.Init(num_cluster_desired_);
00618     num_reference_points_in_cluster_.Init(num_cluster_desired_);    
00619     
00620     printf("Now clustering...\n");
00621 
00622     // Divide the source space into num_cluster_desired_ parts using
00623     // K-center algorithm
00624     max_radius_cluster_ = KCenterClustering();
00625     
00626     // computer the center of the sources
00627     ComputeCenters();
00628 
00629     // Readjust the truncation order based on the actual clustering result.
00630     IFGTChooseTruncationNumber_();
00631     // pd = C_dim^(dim+pterms-1)
00632     total_num_coeffs_ = 
00633       (int) math::BinomialCoefficient(pterms_ + dim_ - 1, dim_);
00634     weighted_coeffs_.Init(total_num_coeffs_, num_cluster_desired_);
00635     unweighted_coeffs_.Init(total_num_coeffs_, num_cluster_desired_);
00636 
00637     printf("Maximum radius generated in the cluster: %g...\n",
00638                 max_radius_cluster_);
00639     printf("Truncation order updated to %d after clustering...\n", 
00640                 pterms_);
00641 
00642     // Compute coefficients.    
00643     printf("Now computing Taylor coefficients...\n");
00644     TaylorExpansion();
00645     printf("Taylor coefficient computation finished...\n");
00646     fx_timer_stop(module_, "ifgt_kde_preprocess");
00647     printf("Preprocessing step finished...\n");
00648   }
00649 
00650   void Compute() {
00651     
00652     printf("Starting the original IFGT-based KDE computation...\n");
00653 
00654     fx_timer_start(module_, "original_ifgt_kde_compute");
00655 
00656     Vector dy;
00657     dy.Init(dim_);
00658     
00659     Vector tempy;
00660     tempy.Init(dim_);
00661     
00662     Vector prods;
00663     prods.Init(total_num_coeffs_);
00664     
00665     ArrayList<int> heads;
00666     heads.Init(dim_);
00667     
00668     // make sure the sum for each query point starts at zero.
00669     densities_.SetZero();
00670     
00671     for(int m = 0; m < query_set_.n_cols(); m++) {      
00672       
00673       // loop over each cluster and evaluate Taylor expansions.
00674       for(int kn = 0; kn < num_cluster_desired_; kn++) {
00675         
00676         double sum2 = 0.0;
00677         
00678         // compute the ratio of the squared distance between each query
00679         // point and each cluster center to the bandwidth factor.
00680         for (int i = 0; i < dim_; i++) {
00681           dy[i] = (query_set_.get(i, m) - cluster_centers_.get(i, kn)) / 
00682             bandwidth_factor_;
00683           sum2 += dy[i] * dy[i];
00684         }
00685         
00686         // If the ratio is greater than the cut-off, this cluster's
00687         // contribution is ignored.
00688         if (sum2 > (cut_off_radius_ + cluster_radii_[kn]) /
00689             (bandwidth_factor_ * bandwidth_factor_)) {
00690           continue;
00691         }
00692         
00693         for(int i = 0; i < dim_; i++) {
00694           heads[i] = 0;
00695         }
00696         
00697         prods[0] = exp(-sum2);          
00698         for(int k = 1, t = 1, tail = 1; k < pterms_; k++, tail = t) {
00699           for (int i = 0; i < dim_; i++) {
00700             int head = heads[i];
00701             heads[i] = t;
00702             for(int j = head; j < tail; j++, t++)
00703               prods[t] = dy[i] * prods[j];
00704           } // for i
00705         }// for k
00706         
00707         for(int i = 0; i < total_num_coeffs_; i++) {
00708           densities_[m] += weighted_coeffs_.get(i, kn) * prods[i];
00709         }
00710         
00711       } // for each cluster
00712     } //for each query point
00713 
00714     // normalize density estimates
00715     NormalizeDensities();
00716 
00717     fx_timer_stop(module_, "original_ifgt_kde_compute");
00718     printf("Computation finished...\n");
00719     return;
00720   }
00721 
00722   void PrintDebug() {
00723     
00724     FILE *stream = stdout;
00725     const char *fname = NULL;
00726     
00727     if((fname = fx_param_str(module_, "ifgt_kde_output", NULL)) != NULL) {
00728       stream = fopen(fname, "w+");
00729     }
00730     for(index_t q = 0; q < query_set_.n_cols(); q++) {
00731       fprintf(stream, "%g\n", densities_[q]);
00732     }
00733     
00734     if(stream != stdout) {
00735       fclose(stream);
00736     }
00737   }
00738 
00739 };
00740 
00741 #endif