00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00041 #ifndef DATASET_SCALER_H
00042 #define DATASET_SACLER_H
00043
#include <cmath>

#include <fastlib/fastlib.h>
00045
00055 class DatasetScaler {
00056
00057 public:
00058
00067 static void TranslateDataByMin(Matrix &qset, Matrix &rset,
00068 bool queries_equal_references) {
00069
00070 int num_dims = rset.n_rows();
00071 DHrectBound<2> qset_bound;
00072 DHrectBound<2> rset_bound;
00073 qset_bound.Init(qset.n_rows());
00074 rset_bound.Init(qset.n_rows());
00075
00076
00077 for(index_t r = 0; r < rset.n_cols(); r++) {
00078 Vector ref_vector;
00079 rset.MakeColumnVector(r, &ref_vector);
00080 rset_bound |= ref_vector;
00081 }
00082 for(index_t q = 0; q < qset.n_cols(); q++) {
00083 Vector query_vector;
00084 qset.MakeColumnVector(q, &query_vector);
00085 qset_bound |= query_vector;
00086 }
00087
00088 for(index_t i = 0; i < num_dims; i++) {
00089 DRange qset_range = qset_bound.get(i);
00090 DRange rset_range = rset_bound.get(i);
00091 double min_coord = min(qset_range.lo, rset_range.lo);
00092 double max_coord = max(qset_range.hi, rset_range.hi);
00093
00094 printf("Dimension %d range: [%g, %g]\n", i, min_coord, max_coord);
00095
00096 for(index_t j = 0; j < rset.n_cols(); j++) {
00097 rset.set(i, j, rset.get(i, j) - min_coord);
00098 }
00099
00100 if(!queries_equal_references) {
00101 for(index_t j = 0; j < qset.n_cols(); j++) {
00102 qset.set(i, j, qset.get(i, j) - min_coord);
00103 }
00104 }
00105 }
00106 }
00107
00117 static void ScaleDataByMinMax(Matrix &qset, Matrix &rset,
00118 bool queries_equal_references) {
00119
00120 index_t num_dims = qset.n_rows();
00121 DHrectBound<2> total_bound;
00122 total_bound.Init(qset.n_rows());
00123
00124
00125 for(index_t r = 0; r < rset.n_cols(); r++) {
00126 Vector ref_vector;
00127 rset.MakeColumnVector(r, &ref_vector);
00128 total_bound |= ref_vector;
00129 }
00130 if(!queries_equal_references) {
00131 for(index_t q = 0; q < qset.n_cols(); q++) {
00132 Vector query_vector;
00133 qset.MakeColumnVector(q, &query_vector);
00134 total_bound |= query_vector;
00135 }
00136 }
00137
00138 for(index_t i = 0; i < num_dims; i++) {
00139 DRange total_range = total_bound.get(i);
00140 double min_coord = total_range.lo;
00141 double max_coord = total_range.hi;
00142 double width = max_coord - min_coord;
00143
00144 printf("Dimension %d range: [%g, %g]\n", i, min_coord, max_coord);
00145
00146 for(index_t j = 0; j < rset.n_cols(); j++) {
00147 if(width > 0) {
00148 rset.set(i, j, (rset.get(i, j) - min_coord) / width);
00149 }
00150 else {
00151 rset.set(i, j, 0);
00152 }
00153 }
00154
00155 if(!queries_equal_references) {
00156 for(index_t j = 0; j < qset.n_cols(); j++) {
00157 if(width > 0) {
00158 qset.set(i, j, (qset.get(i, j) - min_coord) / width);
00159 }
00160 else {
00161 qset.set(i, j, 0);
00162 }
00163 }
00164 }
00165 }
00166 }
00167
00180 static void StandardizeData(Matrix &qset, Matrix &rset,
00181 bool queries_equal_references) {
00182
00183 Vector mean_vector, standard_deviation_vector;
00184
00185 mean_vector.Init(qset.n_rows());
00186 mean_vector.SetZero();
00187 standard_deviation_vector.Init(qset.n_rows());
00188 standard_deviation_vector.SetZero();
00189
00190
00191
00192 for(index_t r = 0; r < rset.n_cols(); r++) {
00193 la::AddTo(rset.n_rows(), rset.GetColumnPtr(r), mean_vector.ptr());
00194 }
00195 if(!queries_equal_references) {
00196 for(index_t q = 0; q < qset.n_cols(); q++) {
00197 la::AddTo(qset.n_rows(), qset.GetColumnPtr(q), mean_vector.ptr());
00198 }
00199 la::Scale(qset.n_rows(), 1.0 / ((double) qset.n_cols() + rset.n_cols()),
00200 mean_vector.ptr());
00201 }
00202 else {
00203 la::Scale(qset.n_rows(), 1.0 / ((double) qset.n_cols()),
00204 mean_vector.ptr());
00205 }
00206
00207
00208 for(index_t r = 0; r < rset.n_cols(); r++) {
00209 for(index_t i = 0; i < rset.n_rows(); i++) {
00210 standard_deviation_vector[i] +=
00211 math::Sqr(rset.get(i, r) - mean_vector[i]);
00212 }
00213 }
00214 if(!queries_equal_references) {
00215 for(index_t q = 0; q < qset.n_cols(); q++) {
00216 for(index_t i = 0; i < qset.n_rows(); i++) {
00217 standard_deviation_vector[i] +=
00218 math::Sqr(qset.get(i, q) - mean_vector[i]);
00219 }
00220 }
00221 la::Scale(qset.n_rows(),
00222 1.0 / ((double) qset.n_cols() + rset.n_cols() - 1),
00223 standard_deviation_vector.ptr());
00224 }
00225 else {
00226 la::Scale(rset.n_rows(), 1.0 / ((double) rset.n_cols()),
00227 standard_deviation_vector.ptr());
00228 }
00229
00230
00231
00232 for(index_t r = 0; r < rset.n_cols(); r++) {
00233 for(index_t d = 0; d < rset.n_rows(); d++) {
00234 rset.set(d, r, (rset.get(d, r) - mean_vector[d]) /
00235 standard_deviation_vector[d]);
00236 }
00237 }
00238 if(!queries_equal_references) {
00239 for(index_t q = 0; q < qset.n_cols(); q++) {
00240 for(index_t d = 0; d < qset.n_rows(); d++) {
00241 qset.set(d, q, (qset.get(d, q) - mean_vector[d]) /
00242 standard_deviation_vector[d]);
00243 }
00244 }
00245 }
00246 }
00247
00248 };
00249
00250 #endif