disk_allnn.h

00001 /* MLPACK 0.2
00002  *
00003  * Copyright (c) 2008, 2009 Alexander Gray,
00004  *                          Garry Boyer,
00005  *                          Ryan Riegel,
00006  *                          Nikolaos Vasiloglou,
00007  *                          Dongryeol Lee,
00008  *                          Chip Mappus, 
00009  *                          Nishant Mehta,
00010  *                          Hua Ouyang,
00011  *                          Parikshit Ram,
00012  *                          Long Tran,
00013  *                          Wee Chin Wong
00014  *
00015  * Copyright (c) 2008, 2009 Georgia Institute of Technology
00016  *
00017  * This program is free software; you can redistribute it and/or
00018  * modify it under the terms of the GNU General Public License as
00019  * published by the Free Software Foundation; either version 2 of the
00020  * License, or (at your option) any later version.
00021  *
00022  * This program is distributed in the hope that it will be useful, but
00023  * WITHOUT ANY WARRANTY; without even the implied warranty of
00024  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00025  * General Public License for more details.
00026  *
00027  * You should have received a copy of the GNU General Public License
00028  * along with this program; if not, write to the Free Software
00029  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
00030  * 02110-1301, USA.
00031  */
00039 #ifndef DISK_ALLNN_H
00040 #define DISK_ALLNN_H
00041 
00042 #include <fastlib/fastlib.h>
00043 
00044 const fx_entry_doc allnn_entries[] = {
00045   {"leaf_size", FX_PARAM, FX_INT, NULL,
00046    "  The maximum number of points to store at a leaf.\n"},
00047   {"tree_building", FX_TIMER, FX_CUSTOM, NULL,
00048    "  Time spent building the kd-tree.\n"},
00049   {"dual_tree_computation", FX_TIMER, FX_CUSTOM, NULL,
00050    "  Time spent computing the nearest neighbors.\n"},
00051   {"number_of_prunes", FX_RESULT, FX_INT, NULL,
00052    "  Total node-pairs found to be too far to matter.\n"},
00053   FX_ENTRY_DOC_DONE
00054 };
00055 
00056 const fx_module_doc allnn_doc = {
00057   allnn_entries, NULL,
00058   "Performs dual-tree all-nearest-neighbors computation.\n"
00059 };
00060 
00061 const fx_entry_doc allnn_naive_entries[] = {
00062   {"naive_time", FX_TIMER, FX_CUSTOM, NULL,
00063    "  Time spend performing the naive computation.\n"},
00064   FX_ENTRY_DOC_DONE
00065 };
00066 
00067 const fx_module_doc allnn_naive_doc = {
00068   allnn_naive_entries, NULL,
00069   "Performs naive all-nearest-neighbors computation.\n"
00070 };
00071 
00095 class DiskAllNN {
00096 
00098 
00099  private:
00107   class QueryStat {
00108 
00109    private:
00114     double max_distance_so_far_;
00115     // these pointers point to the addresses where the subtrees live
00116     ptrdiff_t left_usage_;
00117     ptrdiff_t right_usage_;
00118     
00119    public:
00120     static void *operator new(size_t size) {
00121       return  mmapmm::MemoryManager<false>::allocator_->Alloc<QueryStat>();
00122     } 
00123 
00124     static void operator delete(void *p) {
00125     
00126     } 
00127     QueryStat() {
00128       left_usage_=0;
00129       right_usage_=0;
00130  
00131     }   
00132     double max_distance_so_far() {
00133       return max_distance_so_far_;
00134     }
00135 
00136     void set_max_distance_so_far(double new_dist) {
00137       max_distance_so_far_ = new_dist;
00138     }
00139     
00140     void set_left_usage(ptrdiff_t usage) {
00141       left_usage_=usage;
00142     }
00143     
00144     void set_right_usage(ptrdiff_t usage) {
00145       right_usage_=usage;
00146     }
00147     
00148     ptrdiff_t left_usage() {
00149       return left_usage_;
00150     }
00151 
00152     ptrdiff_t right_usage() {
00153       return left_usage_;
00154     }
00155     
00163     void Init(const Matrix& matrix, index_t start, index_t count) {
00164       // The bound starts at infinity
00165       max_distance_so_far_ = DBL_MAX;
00166    }
00167 
00174     void Init(const Matrix& matrix, index_t start, index_t count,
00175               const QueryStat& left, const QueryStat& right) {
00176       Init(matrix, start, count);
00177     }
00178 
00179   }; /* class AllNNStat */
00180 
00181   // The tree directory defines several tools for the creation of
00182   // custom tree types, especially for kd-trees.  The DHrectBound<2>
00183   // gives us the normal kind of kd-tree bounding boxes, using the
00184   // 2-norm, and Matrix specifies the storage type of our data.
00185 
00187   typedef BinarySpaceTreeMmap<DHrectBoundMmap<2>, Matrix, QueryStat> TreeType;
00188 
00190 
00191  private:
00193   struct datanode* module_;
00194 
00196   Matrix queries_;
00198   Matrix references_;
00199 
00201   TreeType* query_tree_;
00203   TreeType* reference_tree_;
00204 
00206   index_t leaf_size_;
00207 
00209   GenVector<index_t> old_from_new_queries_;
00211   GenVector<index_t> old_from_new_references_;
00212 
00217   Vector neighbor_distances_;
00222   GenVector<index_t> neighbor_indices_;
00223 
00225   index_t number_of_prunes_;
00226   
00228   ptrdiff_t advice_limit_upper_;
00229 
00231   ptrdiff_t advice_limit_lower_;
00232 
00234   bool initialized_;
00236   bool already_used_;
00237 
00238 
00240 
00241   FORBID_ACCIDENTAL_COPIES(DiskAllNN);
00242 
00243  public:
00244   // Default constructors should be kept very simple and should never
00245   // allocate memory.  Their two responsibilities are to ensure that
00246   // it's safe to destroy the object without having otherwise used it
00247   // (e.g. to set pointers to NULL) and to poison memory when in debug
00248   // mode with BIG_BAD_NUMBER = 2146666666 = NaN as a double and
00249   // BIG_BAD_POINTER = 0xdeadbeef.
00250   DiskAllNN() {
00251     query_tree_ = NULL;
00252     reference_tree_ = NULL;
00253 
00254     DEBUG_POISON_PTR(module_);
00255     DEBUG_ONLY(leaf_size_ = BIG_BAD_NUMBER);
00256     DEBUG_ONLY(number_of_prunes_ = BIG_BAD_NUMBER);
00257 
00258     DEBUG_ONLY(initialized_ = false);
00259     DEBUG_ONLY(already_used_ = false);
00260   }
00261 
00262   // Note that we don't delete the fx module; it's managed externally.
00263   ~DiskAllNN() {
00264 
00265   }
00266 
00268 
00273   double MinNodeDistSq_(TreeType* query_node, TreeType* reference_node) {
00274     return query_node->bound().MinDistanceSq(reference_node->bound());
00275   }
00276 
00277 
00283   void GNPBaseCase_(TreeType* query_node, TreeType* reference_node) {
00284 
00285     // Debug checks should be used frequently.  They incur no overhead
00286     // when compiled in --mode=fast and very little otherwise.
00287 
00288     /* Make sure we didn't try to split children */
00289     DEBUG_ASSERT(query_node != NULL);
00290     DEBUG_ASSERT(reference_node != NULL);
00291 
00292     /* Make sure we should be in the base case */
00293     DEBUG_WARN_IF(!query_node->is_leaf());
00294     DEBUG_WARN_IF(!reference_node->is_leaf());
00295 
00296     /* Used to find the query node's new upper bound */
00297     double max_nearest_neighbor_distance = -1.0;
00298 
00299     /* Loop over all query-reference pairs */
00300 
00301     // Trees don't store their points, but instead give index ranges.
00302     // To make this feasible, they have to rearrange their input
00303     // matrices, which is why we were sure to make copies.
00304     for (index_t query_index = query_node->begin();
00305         query_index < query_node->end(); query_index++) {
00306 
00307       // MakeColumnVector aliases (i.e. points to but does not copy) a
00308       // column from the matrix.
00309       //
00310       // A brief aside: BLAS/LAPACK is coded in Fortran and thus
00311       // expects matrices to be column major.  We side with their
00312       // format for compatiblity, and accordingly, it is more cache
00313       // friendly to store data points along columns, as is common in
00314       // statistics, than along rows, as is more conventional.
00315       Vector query_point;
00316       queries_.MakeColumnVector(query_index, &query_point);
00317 
00318       // It's not terrible form to leave TODO statements in code you
00319       // intend to maintain, especially when coding under a deadline.
00320       // These are easy to search for, though for some reason, Garry
00321       // was more partial to "where's WALDO".  More memorable, maybe?
00322 
00323       /* TODO: try pruning query points vs reference node */
00324 
00325       for (index_t reference_index = reference_node->begin();
00326           reference_index < reference_node->end(); reference_index++) {
00327 
00328         Vector reference_point;
00329         references_.MakeColumnVector(reference_index, &reference_point);
00330         if (likely(reference_node != query_node ||
00331                       reference_index != query_index)) {
00332           // BLAS can perform many vectors ops more quickly than C/C++.
00333           double distance =
00334               la::DistanceSqEuclidean(query_point, reference_point);
00335 
00336                 /* Record points found to be closer than the best so far */
00337           if (distance < neighbor_distances_[query_index]) {
00338             neighbor_distances_[query_index] = distance;
00339             neighbor_indices_[query_index] = reference_index;
00340           }
00341         }
00342       } /* for reference_index */
00343 
00344       /* Find the upper bound nn distance for this node */
00345       if (neighbor_distances_[query_index] > max_nearest_neighbor_distance) {
00346         max_nearest_neighbor_distance = neighbor_distances_[query_index];
00347       }
00348 
00349     } /* for query_index */
00350 
00351     /* Update the upper bound nn distance for the node */
00352     query_node->stat().set_max_distance_so_far(max_nearest_neighbor_distance);
00353 
00354   } /* GNPBaseCase_ */
00355 
00356 
00361   void GNPRecursion_(TreeType* query_node, TreeType* reference_node,
00362                      double lower_bound_distance) {
00363 
00364     /* Make sure we didn't try to split children */
00365     DEBUG_ASSERT(query_node != NULL);
00366     DEBUG_ASSERT(reference_node != NULL);
00367 
00368     // The following asserts equality of two doubles and prints their
00369     // values if it fails.  Note that this *isn't* a particularly fast
00370     // debug check, though; it negates the benefit of passing ahead a
00371     // precomputed distance entirely.  That's why we have --mode=fast.
00372 
00373     /* Make sure the precomputed bounding information is correct */
00374     DEBUG_SAME_DOUBLE(lower_bound_distance,
00375         MinNodeDistSq_(query_node, reference_node));
00376 
00377     if (lower_bound_distance > query_node->stat().max_distance_so_far()) {
00378 
00379       /*
00380        * A reference node with lower-bound distance greater than this
00381        * query node's upper-bound nearest neighbor distance cannot
00382        * contribute a reference closer than any of the queries'
00383        * current neighbors, hence prune
00384        */
00385       number_of_prunes_++;
00386 
00387     } else if (query_node->is_leaf() && reference_node->is_leaf()) {
00388 
00389       /* Cannot further split leaves, so process exhaustively */
00390       GNPBaseCase_(query_node, reference_node);
00391 
00392     } else if (query_node->is_leaf()) {
00393 
00394       /* Query node's a leaf, but we can split references */
00395       double left_distance =
00396           MinNodeDistSq_(query_node, reference_node->left());
00397       double right_distance =
00398           MinNodeDistSq_(query_node, reference_node->right());
00399 
00400       /*
00401        * Nearer reference node more likely to contribute neighbors
00402        * (and thus tighten bounds), so visit it first
00403        */
00404       if (left_distance < right_distance) {
00405         // Prefetching directives
00406 #ifdef PREFETCH
00407         if (reference_node->stat().left_usage() > advice_limit_lower_
00408             && reference_node->stat().left_usage() < advice_limit_upper_) {
00409           mmapmm::MemoryManager<false>::allocator_->Advise(reference_node->left(), 
00410               reference_node->stat().left_usage(), MADV_SEQUENTIAL);
00411           mmapmm::MemoryManager<false>::allocator_->Advise(reference_node->right(), 
00412               reference_node->stat().right_usage(), MADV_DONTNEED);
00413         }
00414 #endif
00415       // GNP part
00416         GNPRecursion_(query_node, reference_node->left(), left_distance);
00417         // Prefetching directives
00418 #ifdef PREFETCH
00419         if (reference_node->stat().right_usage() > advice_limit_lower_
00420           && reference_node->stat().right_usage() < advice_limit_upper_) {
00421           mmapmm::MemoryManager<false>::allocator_->Advise(reference_node->right(),
00422               reference_node->stat().right_usage(), MADV_SEQUENTIAL);
00423           mmapmm::MemoryManager<false>::allocator_->Advise(reference_node->left(),
00424               reference_node->stat().left_usage(), MADV_DONTNEED);
00425         }
00426 #endif        
00427         // GNP part
00428         GNPRecursion_(query_node, reference_node->right(), right_distance);
00429       } else {
00430         // Prefetching directives
00431 #ifdef PREFETCH
00432         if (reference_node->stat().right_usage()  > advice_limit_lower_
00433             && reference_node->stat().right_usage() < advice_limit_upper_) {
00434           mmapmm::MemoryManager<false>::allocator_->Advise(reference_node->right(),
00435               reference_node->stat().right_usage(), MADV_SEQUENTIAL);
00436           mmapmm::MemoryManager<false>::allocator_->Advise(reference_node->left(),
00437               reference_node->stat().left_usage(), MADV_DONTNEED);     
00438         }
00439 #endif
00440         // GNP part
00441         GNPRecursion_(query_node, reference_node->right(), right_distance);
00442         // Prefetching directives
00443 #ifdef PREFETCH
00444         if (reference_node->stat().left_usage() > advice_limit_lower_
00445             && reference_node->stat().left_usage() < advice_limit_upper_) {
00446           mmapmm::MemoryManager<false>::allocator_->Advise(reference_node->left(), 
00447               reference_node->stat().left_usage(), MADV_SEQUENTIAL);
00448           mmapmm::MemoryManager<false>::allocator_->Advise(reference_node->right(), 
00449               reference_node->stat().right_usage(), MADV_DONTNEED);       
00450         }
00451 #endif
00452         // GNP part
00453         GNPRecursion_(query_node, reference_node->left(), left_distance);
00454       }
00455 
00456     } else if (reference_node->is_leaf()) {
00457 
00458       /* Reference node's a leaf, but we can split queries */
00459       double left_distance =
00460           MinNodeDistSq_(query_node->left(), reference_node);
00461       double right_distance =
00462           MinNodeDistSq_(query_node->right(), reference_node);
00463 
00464       /* Order of recursion does not matter */
00465       // Prefetching directives
00466 
00467 #ifdef PREFETCH
00468       if (query_node->stat().left_usage() > advice_limit_lower_
00469           && query_node->stat().left_usage() < advice_limit_upper_) {
00470         mmapmm::MemoryManager<false>::allocator_->Advise(query_node->left(), 
00471             query_node->stat().left_usage(), MADV_SEQUENTIAL);
00472         mmapmm::MemoryManager<false>::allocator_->Advise(query_node->right(), 
00473             query_node->stat().right_usage(), MADV_DONTNEED);    
00474       }
00475 #endif
00476       // GNP part 
00477       GNPRecursion_(query_node->left(), reference_node, left_distance);
00478 
00479 #ifdef PREFETCH
00480       if (query_node->stat().right_usage() > advice_limit_lower_
00481           && query_node->stat().right_usage() < advice_limit_upper_) {
00482         mmapmm::MemoryManager<false>::allocator_->Advise(query_node->right(), 
00483             query_node->stat().right_usage(), MADV_SEQUENTIAL);
00484         mmapmm::MemoryManager<false>::allocator_->Advise(query_node->left(), 
00485             query_node->stat().left_usage(), MADV_DONTNEED);        
00486       }
00487 #endif
00488       GNPRecursion_(query_node->right(), reference_node, right_distance);
00489 
00490       /* Update upper bound nn distance base new child bounds */
00491       query_node->stat().set_max_distance_so_far(
00492           max(query_node->left()->stat().max_distance_so_far(),
00493               query_node->right()->stat().max_distance_so_far()));
00494 
00495     } else {
00496 
00497       /*
00498        * Neither node is a leaf, so split both
00499        *
00500        * The order we process the query node's children doesn't
00501        * matter, but for each we should visit their nearer reference
00502        * node first.
00503        */
00504 
00505       double left_distance =
00506           MinNodeDistSq_(query_node->left(), reference_node->left());
00507       double right_distance =
00508           MinNodeDistSq_(query_node->left(), reference_node->right());
00509 
00510       if (left_distance < right_distance) {
00511         // Prefetching directives
00512 #ifdef PREFETCH
00513         if (query_node->stat().left_usage() > advice_limit_lower_ 
00514             && query_node->stat().left_usage() < advice_limit_upper_) {
00515           mmapmm::MemoryManager<false>::allocator_->Advise(query_node->left(), 
00516               query_node->stat().left_usage(), MADV_SEQUENTIAL);
00517           mmapmm::MemoryManager<false>::allocator_->Advise(query_node->right(), 
00518               query_node->stat().right_usage(), MADV_DONTNEED);    
00519         }
00520         if (reference_node->stat().left_usage() > advice_limit_lower_
00521             && reference_node->stat().left_usage() < advice_limit_upper_) {
00522           mmapmm::MemoryManager<false>::allocator_->Advise(reference_node->left(), 
00523               reference_node->stat().left_usage(), MADV_SEQUENTIAL);
00524           mmapmm::MemoryManager<false>::allocator_->Advise(reference_node->right(), 
00525               reference_node->stat().right_usage(), MADV_DONTNEED);    
00526         }
00527 #endif
00528         // GNP part        
00529         GNPRecursion_(query_node->left(),
00530             reference_node->left(), left_distance);
00531         // Prefetching directives
00532 
00533 #ifdef PREFETCH
00534         if (query_node->stat().left_usage() > advice_limit_lower_
00535             && query_node->stat().left_usage() < advice_limit_upper_) {
00536           mmapmm::MemoryManager<false>::allocator_->Advise(query_node->left(), 
00537               query_node->stat().left_usage(), MADV_SEQUENTIAL);
00538           mmapmm::MemoryManager<false>::allocator_->Advise(query_node->right(), 
00539               query_node->stat().right_usage(), MADV_DONTNEED);    
00540         }
00541         if (reference_node->stat().right_usage() > advice_limit_lower_
00542             && reference_node->stat().right_usage() < advice_limit_upper_) {
00543           mmapmm::MemoryManager<false>::allocator_->Advise(reference_node->right(), 
00544               reference_node->stat().right_usage(), MADV_SEQUENTIAL);
00545           mmapmm::MemoryManager<false>::allocator_->Advise(reference_node->left(), 
00546               reference_node->stat().left_usage(), MADV_DONTNEED);    
00547         }
00548 #endif
00549         // GNP part         
00550         GNPRecursion_(query_node->left(),
00551             reference_node->right(), right_distance);
00552       } else {
00553         // Prefetching directives
00554 #ifdef PREFETCH
00555         if (query_node->stat().left_usage() > advice_limit_lower_
00556             && query_node->stat().left_usage() < advice_limit_upper_) {
00557           mmapmm::MemoryManager<false>::allocator_->Advise(query_node->left(), 
00558               query_node->stat().left_usage(), MADV_SEQUENTIAL);
00559           mmapmm::MemoryManager<false>::allocator_->Advise(query_node->right(), 
00560               query_node->stat().right_usage(), MADV_DONTNEED);    
00561         }
00562         if (reference_node->stat().right_usage() > advice_limit_lower_
00563             && reference_node->stat().right_usage() < advice_limit_upper_) {
00564           mmapmm::MemoryManager<false>::allocator_->Advise(reference_node->right(), 
00565               reference_node->stat().right_usage(), MADV_SEQUENTIAL);
00566           mmapmm::MemoryManager<false>::allocator_->Advise(reference_node->left(), 
00567               reference_node->stat().left_usage(), MADV_DONTNEED);    
00568         }
00569 #endif
00570         // GNP part          
00571         GNPRecursion_(query_node->left(),
00572             reference_node->right(), right_distance);
00573         // Prefetching directives
00574 #ifdef PREFETCH
00575         if (query_node->stat().left_usage() > advice_limit_lower_
00576             && query_node->stat().left_usage() < advice_limit_upper_) {
00577           mmapmm::MemoryManager<false>::allocator_->Advise(query_node->left(), 
00578               query_node->stat().left_usage(), MADV_SEQUENTIAL);
00579           mmapmm::MemoryManager<false>::allocator_->Advise(query_node->right(), 
00580               query_node->stat().right_usage(), MADV_DONTNEED);    
00581         }
00582         if (reference_node->stat().left_usage() > advice_limit_lower_
00583             && reference_node->stat().left_usage() < advice_limit_upper_) {
00584           mmapmm::MemoryManager<false>::allocator_->Advise(reference_node->left(), 
00585               reference_node->stat().left_usage(), MADV_SEQUENTIAL);
00586           mmapmm::MemoryManager<false>::allocator_->Advise(reference_node->right(), 
00587               reference_node->stat().right_usage(), MADV_DONTNEED);    
00588         }
00589 #endif
00590         // GNP part         
00591         GNPRecursion_(query_node->left(),
00592             reference_node->left(), left_distance);
00593       }
00594 
00595       left_distance =
00596           MinNodeDistSq_(query_node->right(), reference_node->left());
00597       right_distance =
00598           MinNodeDistSq_(query_node->right(), reference_node->right());
00599 
00600       if (left_distance < right_distance) {
00601         GNPRecursion_(query_node->right(),
00602             reference_node->left(), left_distance);
00603         GNPRecursion_(query_node->right(),
00604             reference_node->right(), right_distance);
00605       } else {
00606         // Prefetching directives
00607 
00608 #ifdef PREFETCH
00609         if (query_node->stat().right_usage() > advice_limit_lower_
00610             && query_node->stat().right_usage() < advice_limit_upper_) {
00611           mmapmm::MemoryManager<false>::allocator_->Advise(query_node->right(), 
00612               query_node->stat().right_usage(), MADV_SEQUENTIAL);
00613           mmapmm::MemoryManager<false>::allocator_->Advise(query_node->left(), 
00614               query_node->stat().left_usage(), MADV_DONTNEED);    
00615         }
00616         if (reference_node->stat().right_usage() > advice_limit_lower_
00617             && reference_node->stat().right_usage() < advice_limit_upper_) {
00618           mmapmm::MemoryManager<false>::allocator_->Advise(reference_node->right(), 
00619               reference_node->stat().right_usage(), MADV_SEQUENTIAL);
00620           mmapmm::MemoryManager<false>::allocator_->Advise(reference_node->left(), 
00621               reference_node->stat().left_usage(), MADV_DONTNEED);    
00622         }
00623 #endif
00624         // GNP part         
00625         GNPRecursion_(query_node->right(),
00626             reference_node->right(), right_distance);
00627         // Prefetching directives
00628 
00629 #ifdef PREFETCH
00630         if (query_node->stat().right_usage() > advice_limit_lower_
00631             && query_node->stat().right_usage() < advice_limit_upper_) {
00632           mmapmm::MemoryManager<false>::allocator_->Advise(query_node->right(), 
00633               query_node->stat().right_usage(), MADV_SEQUENTIAL);
00634           mmapmm::MemoryManager<false>::allocator_->Advise(query_node->left(), 
00635               query_node->stat().left_usage(), MADV_DONTNEED);    
00636         }
00637         if (reference_node->stat().left_usage() > advice_limit_lower_
00638             && reference_node->stat().left_usage() < advice_limit_upper_) {
00639           mmapmm::MemoryManager<false>::allocator_->Advise(reference_node->left(), 
00640               reference_node->stat().left_usage(), MADV_SEQUENTIAL);
00641           mmapmm::MemoryManager<false>::allocator_->Advise(reference_node->right(), 
00642               reference_node->stat().right_usage(), MADV_DONTNEED);    
00643         }
00644 #endif
00645         // GNP part          
00646         GNPRecursion_(query_node->right(),
00647             reference_node->left(), left_distance);
00648       }
00649 
00650       /* Update upper bound nn distance base new child bounds */
00651       query_node->stat().set_max_distance_so_far(
00652           max(query_node->left()->stat().max_distance_so_far(),
00653               query_node->right()->stat().max_distance_so_far()));
00654 
00655     }
00656 
00657   } /* GNPRecursion_ */
00658 
00660 
00661   // Note that we initialize with const references below to keep from
00662   // copying data until we want to.  By the way, which side you put
00663   // the &'s and *'s on is on the level of deep-seated religious
00664   // belief: some people get real angry if you defy them, but you're
00665   // really no worse a person either way.  The compiler is agnostic.
00666 
00670   void Init(const Matrix& queries_in, const Matrix& references_in,
00671             struct datanode* module_in) {
00672     if (queries_in.ptr()==references_in.ptr()) {
00673       FATAL("Data matrices for query tree and reference tree should be different");
00674     }
00675     // It's a good idea to make sure the object isn't initialized a
00676     // second time, as this is almost certainly mistaken.
00677     DEBUG_ASSERT(initialized_ == false);
00678     DEBUG_ONLY(initialized_ = true);
00679 
00680     module_ = module_in;
00681 
00682     /* The data sets need to have the same dimensionality */
00683     DEBUG_SAME_SIZE(queries_.n_rows(), references_.n_rows());
00684 
00685     /* Alias input matrices although they will be rearranged 
00686      * They are too big to copy*/
00687     queries_.Alias(queries_in);
00688     references_.Alias(references_in);
00689 
00690     leaf_size_ = fx_param_int(module_, "leaf_size", 20);
00691     DEBUG_ASSERT(leaf_size_ > 0);
00692 
00693     advice_limit_upper_ = fx_param_int(module_, "advice_limit_upper", 67108864);
00694     DEBUG_ASSERT(advice_limit_upper_ > 0);
00695 
00696     advice_limit_lower_ = fx_param_int(module_, "advice_limit", 409600);
00697     DEBUG_ASSERT(advice_limit_lower_ > 0);
00698     
00699     std::string splits = fx_param_str(module_, "splits", "midpoint"); 
00700     
00701     // Timers are another handy tool provided by FASTexec.  These are
00702     // emitted automatically once you call fx_done.
00703     fx_timer_start(module_, "tree_building");
00704 
00705     // Input matrices are rearranged to an in-order traversal of
00706     // either tree.  To help in iterpretting results, the third
00707     // argument is Init'd to a mapping from rearranged indices to the
00708     // original order.  The fourth argument, if provided, would
00709     // initialize the reverse of said.
00710 
00711     /* Build the trees */
00712     if (splits==std::string("midpoint")) {
00713       query_tree_ = tree_mmap::MakeKdTreeMidpoint<TreeType>(
00714             queries_, leaf_size_, &old_from_new_queries_, NULL);
00715       reference_tree_ = tree_mmap::MakeKdTreeMidpoint<TreeType>(
00716           references_, leaf_size_, &old_from_new_references_, NULL);
00717     }
00718     if (splits==std::string("median")) {
00719       query_tree_ = tree_mmap::MakeKdTreeMedian<TreeType>(
00720             queries_, leaf_size_, &old_from_new_queries_, NULL);
00721       reference_tree_ = tree_mmap::MakeKdTreeMedian<TreeType>(
00722           references_, leaf_size_, &old_from_new_references_, NULL);
00723     }
00724 
00725     // While we don't make use of this here, it is possible to start
00726     // timers after stopping them.  They continue where they left off.
00727     fx_timer_stop(module_, "tree_building");
00728 
00729     /* Ready the list of nearest neighbor candidates to be filled. */
00730     neighbor_indices_.StaticInit(queries_.n_cols());
00731 
00732     /* Ready the vector of upper bound nn distances for use. */
00733     neighbor_distances_.StaticInit(queries_.n_cols());
00734     neighbor_distances_.SetAll(DBL_MAX);
00735 
00736     number_of_prunes_ = 0;
00737 
00738   } /* Init */
00739 
00740   void Init(const Matrix& references_in, fx_module* module_in) {
00741 
00742     // It's a good idea to make sure the object isn't initialized a
00743     // second time, as this is almost certainly mistaken.
00744     DEBUG_ASSERT(initialized_ == false);
00745     DEBUG_ONLY(initialized_ = true);
00746 
00747     module_ = module_in;
00748 
00749     /* Alias input matrices although they will be rearranged 
00750      * They are too big to copy*/
00751     references_.Alias(references_in);
00752     queries_.Alias(references_in);
00753     leaf_size_ = fx_param_int(module_, "leaf_size", 20);
00754     DEBUG_ASSERT(leaf_size_ > 0);
00755 
00756     advice_limit_upper_ = fx_param_int(module_, "advice_limit_upper", 67108864);
00757     DEBUG_ASSERT(advice_limit_upper_ > 0);
00758 
00759     advice_limit_lower_ = fx_param_int(module_, "advice_limit_lower", 409600);
00760     DEBUG_ASSERT(advice_limit_lower_ > 0);
00761 
00762     std::string splits = fx_param_str(module_, "splits", "midpoint"); 
00763 
00764     // Timers are another handy tool provided by FASTexec.  These are
00765     // emitted automatically once you call fx_done.
00766     fx_timer_start(module_, "tree_building");
00767 
00768     // Input matrices are rearranged to an in-order traversal of
00769     // either tree.  To help in iterpretting results, the third
00770     // argument is Init'd to a mapping from rearranged indices to the
00771     // original order.  The fourth argument, if provided, would
00772     // initialize the reverse of said.
00773 
00774     /* Build the trees */
00775     if (splits==std::string("midpoint")) {
00776       query_tree_ = tree_mmap::MakeKdTreeMidpoint<TreeType>(
00777                 queries_, leaf_size_, &old_from_new_queries_, NULL);
00778     }
00779     if (splits==std::string("median")) {
00780       query_tree_ = tree_mmap::MakeKdTreeMedian<TreeType>(
00781                 queries_, leaf_size_, &old_from_new_queries_, NULL);
00782     }
00783 
00784     reference_tree_ = query_tree_; 
00785     old_from_new_references_.Alias(old_from_new_queries_);
00786 
00787     // While we don't make use of this here, it is possible to start
00788     // timers after stopping them.  They continue where they left off.
00789     fx_timer_stop(module_, "tree_building");
00790 
00791     /* Ready the list of nearest neighbor candidates to be filled. */
00792     neighbor_indices_.StaticInit(queries_.n_cols());
00793 
00794     /* Ready the vector of upper bound nn distances for use. */
00795     neighbor_distances_.StaticInit(queries_.n_cols());
00796     neighbor_distances_.SetAll(DBL_MAX);
00797 
00798     number_of_prunes_ = 0;
00799 
00800   } /* Init */
00801 
00802 
00808   void InitNaive(const Matrix& queries_in, const Matrix& references_in,
00809                  fx_module* module_in){
00810 
00811     DEBUG_ASSERT(initialized_ == false);
00812     DEBUG_ONLY(initialized_ = true);
00813 
00814     module_ = module_in;
00815 
00816     /* The data sets need to have the same dimensionality */
00817     DEBUG_SAME_SIZE(queries_.n_rows(), references_.n_rows());
00818 
00819     /* Alias matrices they are too big to copy */
00820     queries_.Alias(queries_in);
00821     references_.Alias(references_in);
00822 
00823     /*
00824      * A bit of a trick so we can still use BaseCase_: we'll expand
00825      * the leaf size so that our trees only have one node.
00826      */
00827     leaf_size_ = max(queries_.n_cols(), references_.n_cols());
00828 
00829     /* Build the (single node) trees */
00830     query_tree_ = tree_mmap::MakeKdTreeMidpoint<TreeType>(
00831               queries_, leaf_size_, &old_from_new_queries_, NULL);
00832     reference_tree_ = tree_mmap::MakeKdTreeMidpoint<TreeType>(
00833         references_, leaf_size_, &old_from_new_references_, NULL);
00834 
00835     /* Ready the list of nearest neighbor candidates to be filled. */
00836     neighbor_indices_.StaticInit(queries_.n_cols());
00837 
00838     /* Ready the vector of upper bound nn distances for use. */
00839     neighbor_distances_.StaticInit(queries_.n_cols());
00840     neighbor_distances_.SetAll(DBL_MAX);
00841 
00842     number_of_prunes_ = 0;
00843 
00844   } /* InitNaive */
00845 
00851   void InitNaive(const Matrix& references_in,
00852                  fx_module* module_in){
00853 
00854     DEBUG_ASSERT(initialized_ == false);
00855     DEBUG_ONLY(initialized_ = true);
00856 
00857     module_ = module_in;
00858 
00859     /* The data sets need to have the same dimensionality */
00860     DEBUG_SAME_SIZE(queries_.n_rows(), references_.n_rows());
00861 
00862     /* Alias matrices they are too big to copy */
00863     queries_.Alias(references_in);
00864     references_.Alias(references_in);
00865 
00866     /*
00867      * A bit of a trick so we can still use BaseCase_: we'll expand
00868      * the leaf size so that our trees only have one node.
00869      */
00870     leaf_size_ = max(queries_.n_cols(), references_.n_cols());
00871 
00872     /* Build the (single node) trees */
00873     query_tree_ = tree_mmap::MakeKdTreeMidpoint<TreeType>(
00874               queries_, leaf_size_, &old_from_new_queries_, NULL);
00875     reference_tree_ =  query_tree_;
00876     old_from_new_references_.Alias(old_from_new_queries_);
00877     /* Ready the list of nearest neighbor candidates to be filled. */
00878     neighbor_indices_.StaticInit(queries_.n_cols());
00879 
00880     /* Ready the vector of upper bound nn distances for use. */
00881     neighbor_distances_.StaticInit(queries_.n_cols());
00882     neighbor_distances_.SetAll(DBL_MAX);
00883 
00884     number_of_prunes_ = 0;
00885 
00886   } /* InitNaive */
00887 
00888 
00893   void ComputeNeighbors(GenVector<index_t>* results, GenVector<double>* distances) {
00894 
00895     // In addition to confirming the object's been initialized, we
00896     // want to make sure we aren't asking it to compute a second time.
00897     DEBUG_ASSERT(initialized_ == true);
00898     DEBUG_ASSERT(already_used_ == false);
00899     DEBUG_ONLY(already_used_ = true);
00900 
00901     fx_timer_start(module_, "dual_tree_computation");
00902 
00903     /* Start recursion on the roots of either tree */
00904     GNPRecursion_(query_tree_, reference_tree_,
00905         MinNodeDistSq_(query_tree_, reference_tree_));
00906 
00907     fx_timer_stop(module_, "dual_tree_computation");
00908 
00909     // Save the total number of prunes to the FASTexec module; this
00910     // will printed after calling fx_done or can be read back later.
00911     fx_result_int(module_, "number_of_prunes", number_of_prunes_);
00912 
00913     if (results) {
00914       EmitResults(results, distances);
00915     }
00916 
00917   } /* ComputeNeighbors */
00918 
00919 
00923   void ComputeNaive(GenVector<index_t>* results, GenVector<double>* distances) {
00924 
00925     DEBUG_ASSERT(initialized_ == true);
00926     DEBUG_ASSERT(already_used_ == false);
00927     DEBUG_ONLY(already_used_ = true);
00928 
00929     fx_timer_start(module_, "naive_time");
00930 
00931     /* BaseCase_ on the roots is equivalent to naive */
00932     GNPBaseCase_(query_tree_, reference_tree_);
00933 
00934     fx_timer_stop(module_, "naive_time");
00935 
00936     if (results) {
00937       EmitResults(results, distances);
00938     }
00939 
00940   } /* ComputeNaive */
00941 
00945   void EmitResults(GenVector<index_t>* results, GenVector<double>* distances) {
00946 
00947     DEBUG_ASSERT(initialized_ == true);
00948 
00949     results->StaticInit(neighbor_indices_.length());
00950     distances->StaticInit(neighbor_distances_.length());
00951 
00952     /* Map the indices back from how they have been permuted. */
00953     for (index_t i = 0; i < neighbor_indices_.length(); i++) {
00954       (*results)[old_from_new_queries_[i]] =
00955           old_from_new_references_[neighbor_indices_[i]];
00956       (*distances)[
00957         old_from_new_references_[i]] = neighbor_distances_[i];
00958     }
00959 
00960   } /* EmitResults */
00961 
00962 }; /* class DiskAllNN */
00963 
00964 #endif 
Generated on Mon Jan 24 12:04:37 2011 for FASTlib by  doxygen 1.6.3