dataset_scaler.h

Go to the documentation of this file.
00001 /* MLPACK 0.2
00002  *
00003  * Copyright (c) 2008, 2009 Alexander Gray,
00004  *                          Garry Boyer,
00005  *                          Ryan Riegel,
00006  *                          Nikolaos Vasiloglou,
00007  *                          Dongryeol Lee,
00008  *                          Chip Mappus, 
00009  *                          Nishant Mehta,
00010  *                          Hua Ouyang,
00011  *                          Parikshit Ram,
00012  *                          Long Tran,
00013  *                          Wee Chin Wong
00014  *
00015  * Copyright (c) 2008, 2009 Georgia Institute of Technology
00016  *
00017  * This program is free software; you can redistribute it and/or
00018  * modify it under the terms of the GNU General Public License as
00019  * published by the Free Software Foundation; either version 2 of the
00020  * License, or (at your option) any later version.
00021  *
00022  * This program is distributed in the hope that it will be useful, but
00023  * WITHOUT ANY WARRANTY; without even the implied warranty of
00024  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00025  * General Public License for more details.
00026  *
00027  * You should have received a copy of the GNU General Public License
00028  * along with this program; if not, write to the Free Software
00029  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
00030  * 02110-1301, USA.
00031  */
00041 #ifndef DATASET_SCALER_H
00042 #define DATASET_SACLER_H
00043 
#include <cmath>

#include <fastlib/fastlib.h>
00045 
00055 class DatasetScaler {
00056 
00057  public:
00058 
00067   static void TranslateDataByMin(Matrix &qset, Matrix &rset,
00068                                  bool queries_equal_references) {
00069     
00070     int num_dims = rset.n_rows();
00071     DHrectBound<2> qset_bound;
00072     DHrectBound<2> rset_bound;
00073     qset_bound.Init(qset.n_rows());
00074     rset_bound.Init(qset.n_rows());
00075 
00076     // go through each query/reference point to find out the bounds
00077     for(index_t r = 0; r < rset.n_cols(); r++) {
00078       Vector ref_vector;
00079       rset.MakeColumnVector(r, &ref_vector);
00080       rset_bound |= ref_vector;
00081     }
00082     for(index_t q = 0; q < qset.n_cols(); q++) {
00083       Vector query_vector;
00084       qset.MakeColumnVector(q, &query_vector);
00085       qset_bound |= query_vector;
00086     }
00087 
00088     for(index_t i = 0; i < num_dims; i++) {
00089       DRange qset_range = qset_bound.get(i);
00090       DRange rset_range = rset_bound.get(i);
00091       double min_coord = min(qset_range.lo, rset_range.lo);
00092       double max_coord = max(qset_range.hi, rset_range.hi);
00093 
00094       printf("Dimension %d range: [%g, %g]\n", i, min_coord, max_coord);
00095 
00096       for(index_t j = 0; j < rset.n_cols(); j++) {
00097         rset.set(i, j, rset.get(i, j) - min_coord);
00098       }
00099 
00100       if(!queries_equal_references) {
00101         for(index_t j = 0; j < qset.n_cols(); j++) {
00102           qset.set(i, j, qset.get(i, j) - min_coord);
00103         }
00104       }
00105     }
00106   }
00107 
00117   static void ScaleDataByMinMax(Matrix &qset, Matrix &rset,
00118                                 bool queries_equal_references) {
00119     
00120     index_t num_dims = qset.n_rows();
00121     DHrectBound<2> total_bound;
00122     total_bound.Init(qset.n_rows());
00123 
00124     // go through each query/reference point to find out the bounds
00125     for(index_t r = 0; r < rset.n_cols(); r++) {
00126       Vector ref_vector;
00127       rset.MakeColumnVector(r, &ref_vector);
00128       total_bound |= ref_vector;
00129     }
00130     if(!queries_equal_references) {
00131       for(index_t q = 0; q < qset.n_cols(); q++) {
00132         Vector query_vector;
00133         qset.MakeColumnVector(q, &query_vector);
00134         total_bound |= query_vector;
00135       }
00136     }
00137 
00138     for(index_t i = 0; i < num_dims; i++) {
00139       DRange total_range = total_bound.get(i);
00140       double min_coord = total_range.lo;
00141       double max_coord = total_range.hi;
00142       double width = max_coord - min_coord;
00143 
00144       printf("Dimension %d range: [%g, %g]\n", i, min_coord, max_coord);
00145 
00146       for(index_t j = 0; j < rset.n_cols(); j++) {
00147         if(width > 0) {
00148           rset.set(i, j, (rset.get(i, j) - min_coord) / width);
00149         }
00150         else {
00151           rset.set(i, j, 0);
00152         }
00153       }
00154 
00155       if(!queries_equal_references) {
00156         for(index_t j = 0; j < qset.n_cols(); j++) {
00157           if(width > 0) {
00158             qset.set(i, j, (qset.get(i, j) - min_coord) / width);
00159           }
00160           else {
00161             qset.set(i, j, 0);
00162           }
00163         }
00164       }
00165     }
00166   }
00167 
00180   static void StandardizeData(Matrix &qset, Matrix &rset,
00181                               bool queries_equal_references) {
00182 
00183     Vector mean_vector, standard_deviation_vector;
00184 
00185     mean_vector.Init(qset.n_rows());
00186     mean_vector.SetZero();
00187     standard_deviation_vector.Init(qset.n_rows());
00188     standard_deviation_vector.SetZero();
00189 
00190     // Go through each query/reference point to find out the mean
00191     // vectors.
00192     for(index_t r = 0; r < rset.n_cols(); r++) {
00193       la::AddTo(rset.n_rows(), rset.GetColumnPtr(r), mean_vector.ptr());
00194     }
00195     if(!queries_equal_references) {
00196       for(index_t q = 0; q < qset.n_cols(); q++) {
00197         la::AddTo(qset.n_rows(), qset.GetColumnPtr(q), mean_vector.ptr());
00198       }
00199       la::Scale(qset.n_rows(), 1.0 / ((double) qset.n_cols() + rset.n_cols()),
00200                 mean_vector.ptr());
00201     }
00202     else {
00203       la::Scale(qset.n_rows(), 1.0 / ((double) qset.n_cols()),
00204                 mean_vector.ptr());
00205     }
00206 
00207     // Now find out the standard deviation along each dimension.
00208     for(index_t r = 0; r < rset.n_cols(); r++) {
00209       for(index_t i = 0; i < rset.n_rows(); i++) {
00210         standard_deviation_vector[i] += 
00211           math::Sqr(rset.get(i, r) - mean_vector[i]);
00212       }
00213     }
00214     if(!queries_equal_references) {
00215       for(index_t q = 0; q < qset.n_cols(); q++) {
00216         for(index_t i = 0; i < qset.n_rows(); i++) {
00217           standard_deviation_vector[i] +=
00218             math::Sqr(qset.get(i, q) - mean_vector[i]);
00219         }
00220       }
00221       la::Scale(qset.n_rows(), 
00222                 1.0 / ((double) qset.n_cols() + rset.n_cols() - 1),
00223                 standard_deviation_vector.ptr());
00224     }
00225     else {
00226       la::Scale(rset.n_rows(), 1.0 / ((double) rset.n_cols()),
00227                 standard_deviation_vector.ptr());
00228     }
00229 
00230     // Now scale the datasets using the computed mean and the standard
00231     // deviation.
00232     for(index_t r = 0; r < rset.n_cols(); r++) {
00233       for(index_t d = 0; d < rset.n_rows(); d++) {
00234         rset.set(d, r, (rset.get(d, r) - mean_vector[d]) / 
00235                  standard_deviation_vector[d]);
00236       }
00237     }
00238     if(!queries_equal_references) {
00239       for(index_t q = 0; q < qset.n_cols(); q++) {
00240         for(index_t d = 0; d < qset.n_rows(); d++) {
00241           qset.set(d, q, (qset.get(d, q) - mean_vector[d]) /
00242                    standard_deviation_vector[d]);
00243         }
00244       }
00245     }
00246   }
00247 
00248 };
00249 
00250 #endif
Generated on Mon Jan 24 12:04:38 2011 for FASTlib by  doxygen 1.6.3