sort_s.h

00001 /* -*- mode:C++; c-basic-offset:4 -*-
00002      Shore-MT -- Multi-threaded port of the SHORE storage manager
00003    
00004                        Copyright (c) 2007-2009
00005       Data Intensive Applications and Systems Labaratory (DIAS)
00006                Ecole Polytechnique Federale de Lausanne
00007    
00008                          All Rights Reserved.
00009    
00010    Permission to use, copy, modify and distribute this software and
00011    its documentation is hereby granted, provided that both the
00012    copyright notice and this permission notice appear in all copies of
00013    the software, derivative works or modified versions, and any
00014    portions thereof, and that both notices appear in supporting
00015    documentation.
00016    
00017    This code is distributed in the hope that it will be useful, but
00018    WITHOUT ANY WARRANTY; without even the implied warranty of
00019    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. THE AUTHORS
00020    DISCLAIM ANY LIABILITY OF ANY KIND FOR ANY DAMAGES WHATSOEVER
00021    RESULTING FROM THE USE OF THIS SOFTWARE.
00022 */
00023 
00024 /*<std-header orig-src='shore' incl-file-exclusion='SORT_S_H'>
00025 
00026  $Id: sort_s.h,v 1.34 2010/07/01 00:08:22 nhall Exp $
00027 
00028 SHORE -- Scalable Heterogeneous Object REpository
00029 
00030 Copyright (c) 1994-99 Computer Sciences Department, University of
00031                       Wisconsin -- Madison
00032 All Rights Reserved.
00033 
00034 Permission to use, copy, modify and distribute this software and its
00035 documentation is hereby granted, provided that both the copyright
00036 notice and this permission notice appear in all copies of the
00037 software, derivative works or modified versions, and any portions
00038 thereof, and that both notices appear in supporting documentation.
00039 
00040 THE AUTHORS AND THE COMPUTER SCIENCES DEPARTMENT OF THE UNIVERSITY
00041 OF WISCONSIN - MADISON ALLOW FREE USE OF THIS SOFTWARE IN ITS
00042 "AS IS" CONDITION, AND THEY DISCLAIM ANY LIABILITY OF ANY KIND
00043 FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
00044 
00045 This software was developed with support by the Advanced Research
00046 Project Agency, ARPA order number 018 (formerly 8230), monitored by
00047 the U.S. Army Research Laboratory under contract DAAB07-91-C-Q518.
00048 Further funding for this work was provided by DARPA through
00049 Rome Research Laboratory Contract No. F30602-97-2-0247.
00050 
00051 */
00052 
00053 #ifndef SORT_S_H
00054 #define SORT_S_H
00055 
00056 #include "w_defines.h"
00057 
00058 /*  -- do not edit anything above this line --   </std-header>*/
00059 
00060 #ifdef __GNUG__
00061 #pragma interface
00062 #endif
00063 
00064 // forward
00065 class file_p;
00066 class record_t;
00067 
00068 #define OLDSORT_COMPATIBILITY
00069 
00070 /**\brief A namespace to contain certain types used for ss_m::sort_file. */
00071 namespace ssm_sort {
00072 
00073 /**\brief Descriptor for sort key, used with sort_stream_i
00074  *
00075  * This structure was used in the original implementation
00076  * of sort_file (now deprecated).  Unfortunately, the
00077  * API for sort_stream_i has not been updated to use the
00078  * alternative key descriptor structure, sort_keys_t.
00079  */
00080 struct key_info_t {
00081     typedef sortorder::keytype key_type_t;
00082 
00083     /**\cond skip */
00084     /// for backward compatibility only: should use keytype henceforth
00085     enum dummy_t {  t_char=sortorder::kt_u1,
00086                     t_int=sortorder::kt_i4,
00087                     t_float=sortorder::kt_f4,
00088                     t_string=sortorder::kt_b, 
00089                     t_spatial=sortorder::kt_spatial};
00090     /**\endcond skip */
00091 
00092     key_type_t  type;       // key type
00093     /// For sorting on spatial keys
00094     nbox_t      universe;   
00095     /**\brief Is a derived key. 
00096      * If true, the key must be the only item in rec
00097      *  header, and the header will not be copied to
00098      *  the result record (allows the user to store derived
00099      *  key in the header temporarily, for sorting purposes).
00100      */
00101     bool        derived;    
00102 
00103     /**\enum where_t
00104      **\brief Describes where the key is found: in a record header or body.
00105      * For file sort. 
00106      *
00107      * - t_hdr  : key is in the record header
00108      * - t_body  : key is in the record body
00109      */
00110     enum where_t { t_hdr = 0, t_body };
00111     /**\brief Where the key resides: header or body */
00112     where_t              where;      
00113     /**\brief Offset from beginning of header or body where the key starts */
00114     w_base_t::uint4_t    offset;     
00115     /**\brief Length of key */
00116     w_base_t::uint4_t    len;        
00117     /**\brief Estimated record length. A hint. */
00118     w_base_t::uint4_t    est_reclen; 
00119     
00120     key_info_t() {
00121       type = sortorder::kt_i4;
00122       where = t_body;
00123       derived = false;
00124       offset = 0;
00125       len = sizeof(int);
00126       est_reclen = 0;
00127     }
00128 };
00129 
00130 /**\brief Behavioral options for sort_stream_i.
00131  *
00132  * This structure was used in the original implementation
00133  * of sort_file (now deprecated).  Unfortunately, the
00134  * API for sort_stream_i has not been updated to use the
00135  * alternative behavioral options structure, sort_keys_t.
00136  */
00137 struct sort_parm_t {
00138     /// Number of pages for each run.
00139     uint2_t run_size;        
00140     /// Volume on which scratch files will reside.
00141     vid_t   vol;        
00142     /// True -> duplicates will be removed. 
00143     bool   unique;        
00144     /// Sort in ascending order?
00145     bool   ascending;        
00146     /// Destroy the input file when done?
00147     bool   destructive;    
00148 
00149     /// Logging level of scratch files
00150     smlevel_3::sm_store_property_t property; 
00151 
00152     sort_parm_t() : run_size(10), unique(false), ascending(true),
00153             destructive(false), 
00154             property(smlevel_3::t_regular) {}
00155 };
00156 
00157 
00158 /*
00159  * For new sort:
00160  */
00161 
00162 /**\brief Input, output argument to CSKF, MOF, UMOF callbacks.
00163  * \ingroup SSMSORT
00164  * \details
00165  * Since some of the callbacks need variously an integer or a
00166  * pointer, this class
00167  * puns an integer and a void *.
00168  *
00169  * See ss_m::sort_file
00170  */
00171 class key_cookie_t {
00172     void *   c; // datum stored as a void*, punned by
00173                 // the extraction methods
00174 public:
00175     explicit key_cookie_t () : c(NULL) { }
00176 
00177     /// Create a cookie from an integer.
00178     explicit key_cookie_t (int i) {
00179         union { int _i; void *v; } _pun = {i};
00180         c = _pun.v;
00181     }
00182     /// Create a cookie from a pointer.
00183     explicit key_cookie_t (void *v):c(v) { }
00184 
00185     /// Extract integer value.
00186     int make_int() const { return operator int(); }
00187 
00188     /// Extract smsize_t value.
00189     int make_smsize_t() const { return operator smsize_t(); }
00190 
00191     /// Extract ptr value.
00192     void *make_ptr() const { return c; }
00193 
00194     static key_cookie_t   null; // newsort.cpp
00195 
00196 private:
00197     operator int () const { 
00198          union { void *v; int _i; } _pun = {c};
00199              return _pun._i;
00200     }
00201 
00202     operator smsize_t () const { 
00203          union { void *v; int _i; } _pun = {c};
00204          smsize_t t =  _pun._i & 0xffffffff;
00205 #ifdef ARCH_LP64
00206          w_assert1((_pun._i & 0xffffffff00000000) == 0);
00207 #endif
00208          return t;
00209     }
00210 };
00211 
00212 /**\brief A memory allocator (abstract base class) used by ss_m::sort_file and its adjuncts.
00213  * \ingroup SSMSORT
00214  * \details
00215  * The ssm_sort::CSKF, ssm_sort::MOF and ssm_sort::UMOF callbacks  
00216  * may need to allocate memory, in which case they 
00217  * should use the factory passed in as an argument; this
00218  * allows the storage manager to free the memory using the same
00219  * allocator.
00220  *
00221  * These allocators are used with object_t (record handles) and 
00222  * skey_t (key handles).
00223  *
00224  * The default allocators used by ss_m::sort_file are 
00225  * - factory_t::none : does not allocate or free, used for statically-
00226  *   allocated space.
00227  * - factory_t::cpp_vector : a factory that allocates byte-strings that
00228  *   start on a double-aligned boundary and frees them. 
00229  *
00230  *  Users must write their own factories that inherit from
00231  *  factory_t.
00232  */
00233 class factory_t 
00234 {
00235    /* users are meant to write their own factories 
00236     * that inherit from this
00237     */
00238 public:
00239    factory_t();
00240    virtual    NORET ~factory_t();
00241 
00242    virtual    void* allocfunc(smsize_t)=0;
00243    virtual    void freefunc(const void *, smsize_t)=0;
00244 
00245    // none: causes no delete - used for statically allocated space
00246    static factory_t*    none;
00247 
00248    // cpp_vector - simply calls delete[] 
00249    static factory_t*    cpp_vector;
00250 
00251    void freefunc(vec_t&v) {
00252     for(int i=v.count()-1; i>=0; i--) {
00253         DBG(<<"freefuncVEC(ptr=" << (void*)v.ptr(i) << " len=" << v.len(i));
00254         freefunc((void *)v.ptr(i), v.len(i));
00255     }
00256    }
00257 };
00258 
00259 /* XXX move virtual functions to a .cpp */
00260 inline factory_t::factory_t() {}
00261 inline NORET factory_t::~factory_t() {}
00262 
00263 /**\brief Descriptor for fixed-location keys
00264  * \ingroup SSMSORT
00265  * \details
00266  * A sort_keys_t contains an instance of this for each key.
00267  */
00268 class key_location_t 
00269 {
00270 public:
00271     /// Key is in record header
00272     bool         _in_hdr;
00273     /// Offset from start of header/body of start of key
00274     smsize_t     _off;
00275     /// Length of key
00276     smsize_t     _length;
00277 
00278     key_location_t() : _in_hdr(false), _off(0), _length(0)  {}
00279 
00280     key_location_t(const key_location_t &old) : 
00281         _in_hdr(old._in_hdr), 
00282         _off(old._off), _length(old._length) {}
00283 
00284     /// Key is in record header
00285     bool is_in_hdr() const { return _in_hdr; }
00286 };
00287 
00288 class skey_t; // forward
00289 
00290 /**\brief Handle on a record in the buffer pool or in scratch memory.
00291  * \ingroup SSMSORT
00292  * \details
00293  * The ss_m::sort_file function calls a variety of user-defined 
00294  * callback functions to process records in the buffer pool
00295  * or in-memory copies of records and to produce more of the same. 
00296  *
00297  * This class provides a generic handle to reference the records or copies of 
00298  * records, since the callbacks need only to
00299  * - know the header length
00300  * - find the data at an offset in the header
00301  * - know the body length
00302  * - find the data at an offset in the body
00303  * - copy a generic handle
00304  * - copy out a portion of the header or body to a buffer
00305  * - create a new generic handle from a pair of buffer
00306  *
00307  * An object_t created from buffers must also know how the buffers were
00308  * allocated so it can free the buffers to the same heap, so it keeps
00309  * a reference to the factory_t used by the callback function
00310  * to create the buffers. 
00311  */
00312 class object_t : public smlevel_top 
00313 {
00314     friend class skey_t;
00315     static const object_t& none;
00316 protected:
00317     void        _construct(file_p&, slotid_t);
00318     void        _construct(
00319               const void *hdr, smsize_t hdrlen, factory_t *,
00320               const void *body, smsize_t bodylen, factory_t *);
00321 
00322     void      _replace(const object_t&); // copy over
00323     NORET     object_t();
00324 public:
00325     NORET     object_t(const object_t&o) 
00326                :
00327                _valid(false),
00328                _in_bp(false),
00329                _rec(0),
00330                _fp(0),
00331                _hdrfact(factory_t::none),
00332                _hdrlen(0),
00333                _hdrbuf(0),
00334                _bodyfact(factory_t::none),
00335                _bodylen(0),
00336                _bodybuf(0)
00337             { _replace(o); }
00338 
00339     NORET     object_t(
00340               const void *hdr, 
00341               smsize_t hdrlen, 
00342               factory_t& hf,
00343               const void *body, 
00344               smsize_t bodylen, 
00345               factory_t& bf) 
00346                :
00347                _valid(false),
00348                _in_bp(false),
00349                _rec(0),
00350                _fp(0),
00351                _hdrfact(&hf),
00352                _hdrlen(hdrlen),
00353                _hdrbuf(hdr),
00354                _bodyfact(&bf),
00355                _bodylen(bodylen),
00356                _bodybuf(body)
00357                { }
00358 
00359     NORET     ~object_t();
00360 
00361     bool        is_valid() const  { return _valid; }
00362     bool        is_in_buffer_pool() const { return is_valid() && _in_bp; }
00363     smsize_t    hdr_size() const { return _hdrlen; }
00364     smsize_t    body_size() const { return _bodylen; }
00365 
00366     const void *hdr(smsize_t offset) const; 
00367     const void *body(smsize_t offset) const;
00368     void        freespace();
00369     void        assert_nobuffers() const;
00370     smsize_t    contig_body_size() const; // pinned amt
00371 
00372     w_rc_t      copy_out(
00373                     bool in_hdr, 
00374                     smsize_t offset, 
00375                     smsize_t length, 
00376                     vec_t&dest) const;
00377 
00378 
00379 private:      // data
00380     bool        _valid;
00381     bool        _in_bp; // in buffer pool or in memory
00382 
00383     // for in_buffer_pool:
00384     record_t*    _rec;
00385     file_p*      _fp;
00386 
00387     // for in_memory:
00388     factory_t*    _hdrfact;
00389     smsize_t      _hdrlen;
00390     const void*   _hdrbuf;
00391 
00392     factory_t*    _bodyfact;
00393     smsize_t      _bodylen;
00394     const void*   _bodybuf;
00395 
00396 protected:
00397     int     _save_pin_count;
00398         // for callback
00399     void      _callback_prologue() const {
00400 #        if W_DEBUG_LEVEL > 2
00401             /*
00402              * leaving SM
00403              */
00404             // escape const-ness of the method
00405             int *save_pin_count = (int *)&_save_pin_count;
00406             *save_pin_count = me()->pin_count();
00407             w_assert3(_save_pin_count == 0);
00408             me()->check_pin_count(0);
00409             me()->in_sm(false);
00410 #        endif 
00411         }
00412     void      _callback_epilogue() const {
00413 #        if W_DEBUG_LEVEL > 2
00414             /*
00415              * re-entering SM
00416              */
00417             me()->check_actual_pin_count(_save_pin_count);
00418             me()->in_sm(true);
00419 #        endif 
00420         }
00421     void    _invalidate() {
00422                _valid=false;
00423                _in_bp=false;
00424                _rec = 0;
00425                _fp=0;
00426                _hdrfact=factory_t::none;
00427                _hdrlen=0;
00428                _hdrbuf=0;
00429                _bodyfact=factory_t::none;
00430                _bodylen=0;
00431                _bodybuf=0;
00432                _save_pin_count=0;
00433             }
00434 };
00435 
00436 class run_mgr; // forward
00437 
00438 /**\brief The result of a CSKF function.  
00439  * \ingroup SSMSORT
00440  * \details
00441  * This is a handle on a key.  
00442  * The key might reside in scratch memory or in the buffer pool.
00443  * This class provides a generic API for using such a key.
00444  *
00445  * The attributes are:
00446  * - pointer to start of key (method ptr())
00447  * - length of key (method size())
00448  * - length of contiguous portion of key (method contig_length())
00449  * - location (in buffer pool or scratch memory) (method is_in_obj())
00450  * - sub-location (in buffer pool: in record header or body) (method is_in_hdr())
00451  * - the heap allocator (factory_t) used to allocate buffers if this
00452  *   refers to a key in scratch memory.
00453  *
00454  *   Generally only the constructor is used by a CSKF callback function,
00455  *   and the idiom used (indeed, the only way it can 
00456  *   be populated) is placement new as follows:
00457  *   \code
00458 using namespace ssm_sort;
00459 w_rc_t 
00460 exampleCSKF(
00461     const rid_t&         rid,
00462     const object_t&      obj,
00463     key_cookie_t         ,  
00464     factory_t&           f,
00465     skey_t*              out
00466 )
00467 {
00468     {
00469     // this shows how to create an skey_t from the entire header of the
00470     // incoming object_t, whether the object is in the buffer pool or
00471     // in scratch memory:
00472     smsize_t length = obj.hdr_size();
00473     smsize_t offset = 0;
00474     bool     in_hdr = true;
00475     new(out) skey_t(obj, offset, length, in_hdr);
00476     }
00477 
00478     {
00479     // this shows how to create an skey_t from the last 3 bytes of the
00480     // body of the incoming object_t, whether the 
00481     // object is in the buffer pool or in scratch memory:
00482     smsize_t length = 3;
00483     smsize_t offset = obj.body_size() - length;
00484     bool     in_hdr = false;
00485     new(out) skey_t(obj, offset, length, in_hdr);
00486     }
00487 
00488     {
00489     // this shows how to create an skey_t derived from the record id
00490     int val = rid.pid.page;
00491     smsize_t length = sizeof(val);
00492     smsize_t offset = 0;
00493     void *buf = f.allocfunc(length);
00494     memcpy(&buf, &val, sizeof(val));
00495     new(out) skey_t(buf, offset, length, f);
00496     }
00497 
00498     return RCOK;
00499 }
00500 \endcode
00501  */
00502 class skey_t 
00503 {
00504     // To let run_mgr construct empty skey_t
00505     friend class ssm_sort::run_mgr;
00506 public:
00507     /**\brief Construct from a key in the buffer pool
00508      * @param[in]  o  structure describing the key location
00509      * @param[in]  offset  offset from record header or body where key starts
00510      * @param[in]  len  length of entire key
00511      * @param[in]  in_hdr  true if key is in record header, false if in body
00512      */
00513     NORET skey_t(
00514         const object_t& o, 
00515         smsize_t offset,
00516         smsize_t len, 
00517         bool in_hdr) 
00518 
00519         : _valid(true), _in_obj(true),
00520         _length(len),
00521         _obj(&o), _offset(offset),
00522         _in_hdr(in_hdr), 
00523         _fact(factory_t::none), _buf(0)
00524         {
00525         }
00526     /**\brief Construct from a key in scratch memory
00527      * @param[in]  buf  scratch memory buffer
00528      * @param[in]  offset  offset from buf where key starts
00529      * @param[in]  len  length of entire key
00530      * @param[in]  f  factory used to allocate the buffer
00531      */
00532     NORET skey_t(
00533         void *buf, 
00534         smsize_t offset,
00535         smsize_t len,  // KEY len, NOT NECESSARILY BUF len
00536         factory_t& f
00537         ) 
00538         : _valid(true), _in_obj(false),
00539         _length(len),
00540         _obj(&object_t::none),
00541         _offset(offset),
00542         _in_hdr(false), 
00543         _fact(&f), _buf(buf)
00544         {
00545         }
00546 
00547     NORET   ~skey_t() {}
00548 
00549     /// Length of entire key
00550     smsize_t  size() const     { return _length; }
00551 
00552     /// True if this structure is populated (points to a key)
00553     bool      is_valid() const { return _valid; }
00554 
00555     /// True if key is in the buffer pool
00556     bool      is_in_obj() const { return is_valid() && _in_obj; }
00557 
00558     /**\brief True unless this key is for an object other than \a o
00559      *
00560      * Used for assertions in sort code.
00561      */
00562     bool      consistent_with_object(const object_t&o ) const { 
00563                                  return ((_obj == &o) || !_in_obj); }
00564 
00565     /// True if key is in buffer pool and is in record header
00566     bool      is_in_hdr() const { return is_in_obj() && _in_hdr; }
00567 
00568     /// Pointer into byte at \a offset in key
00569     const void *ptr(smsize_t offset) const;  // key
00570 
00571     /// Using its factory, free space allocated for the in-scratch-memory buffer
00572     void      freespace();
00573 
00574     /// Asserts that no heap memory is held by this or its object_t.
00575     void      assert_nobuffers()const;
00576 
00577     /// Pinned amount of the key
00578     smsize_t  contig_length() const; 
00579 
00580     /// Copies key into vector (which must have pre-allocated space)
00581     w_rc_t    copy_out(vec_t&dest) const;
00582 
00583 private:
00584     bool        _valid; 
00585     bool        _in_obj; // else in mem
00586     smsize_t    _length;
00587 protected:
00588     // for in_obj case;
00589     const object_t*    _obj;     
00590     smsize_t    _offset; // into buf or object of start of key
00591 private:
00592     bool        _in_hdr;
00593     // for !in_obj but valid
00594     factory_t*    _fact;
00595     const void*    _buf;   // buf to be deallocated -- key
00596                 // might not start at offset 0
00597 protected:
00598     void      _invalidate();
00599     NORET     skey_t() : 
00600                 _valid(false), _in_obj(false), _length(0),
00601                 _obj(&object_t::none),
00602                 _offset(0), _in_hdr(false),
00603                 _fact(factory_t::none), _buf(0)
00604                 { }
00605     void    _construct(
00606                const object_t *o, smsize_t off, smsize_t l, bool h) {
00607                 _valid = true;
00608                 _in_obj = true;
00609                 _obj = o;
00610                 _offset = off;
00611                 _length = l;
00612                 _in_hdr = h;
00613                 _fact = factory_t::none;
00614                 }
00615     void      _construct(
00616             const void *buf, smsize_t off, smsize_t l, factory_t* f) {
00617                 _valid = true;
00618                 _obj = 0;
00619                 _in_obj = false;
00620                 _offset = off;
00621                 _length = l;
00622                 _fact = f;
00623                 _buf = buf;
00624                 }
00625     void    _replace(const skey_t&); // copy over
00626     void    _replace_relative_to_obj(const object_t &, const skey_t&); // copy over
00627 }; // skey_t
00628 
00629 
00630  /** 
00631  **\typedef  CSKF
00632  * \ingroup SSMSORT
00633  *\brief Create-Sort-Key Function.
00634  * @param[in] rid   Record ID of the record containing the key.
00635  * @param[in] in_obj  Handle (object_t) on the record whose key we need. 
00636  * @param[in] cookie  Describes the location and type of key in the source
00637  * record.
00638  * @param[in] f  Class for managing allocated space.
00639  * @param[out] out   Result is written to this object_t.  
00640  * \details
00641  *
00642  * This type of callback function fills in the skey_t \a out for
00643  *  a key in an object (record).  It is called only when the object 
00644  *  is first encountered in its run in a sort.
00645  *
00646  *  A set of predefined callbacks are provided, q.v.:
00647  *  - sort_keys_t::noCSKF  : vacuous - does nothing
00648  *  - sort_keys_t::generic_CSKF :  copies or lexifies a key based on its
00649  *      key_cookie_t argument.
00650  *
00651  *  See generic_CSKF_cookie for an example of a simple user-defined CSKF.
00652  *
00653  *  The skey_t \a out is pre-allocated and must be populated by 
00654  *  this callback function.
00655  *  The factory_t \a f may be used for this allocation, or a user-defined
00656  *  factory_t may be used instead. Whatever factory is used to allocate
00657  *  the buffer to hold a key, that same factory must be placed in
00658  *  the skey_t via the placement-new constructor call. Examples are
00659  *  given in the description for skey_t.
00660  *
00661  *  This callback must not free any space for \a in_obj.
00662  */  
00663 typedef w_rc_t (*CSKF)(
00664     const rid_t&        rid,
00665     const object_t&     in_obj,
00666     key_cookie_t        cookie,  // type info
00667     factory_t&          f,
00668     skey_t*             out
00669 );
00670 
00671 /**\typedef MOF
00672  * \ingroup SSMSORT
00673  *\brief  Marshal Object Function 
00674  * @param[in] rid  Record id of the source record to marshal
00675  * @param[in] obj_in Handle on the source record (object_t)
00676  * @param[in] cookie Describes location and type of key in the source record
00677  * @param[out] obj_out Result is to be written to this object_t.  
00678  *\details
00679  * The \a obj_out is already allocated; this function must populate it.
00680  * The \a obj_out has no buffers or factory associated with it. The
00681  * MOF must populate the \a obj_out from its own factory (it may use
00682  * factory_t::cpp_vector, which uses the global heap, or it may 
00683  * use the factory from the \a obj_in, but ONLY if that factory is
00684  * \b not factory_t::none.
00685  * The \a obj_out
00686  * object carries its factory along with it throughout the sort process.
00687  *
00688  * This callback must not free any space for \a obj_in.
00689  *
00690  *  This type of callback is used 
00691  *  - when a record is first encountered in the input file
00692  *  - if sort_keys_t::carry_obj() is true,
00693  *  it's also called when the object is read back
00694  *  from a temporary file.
00695  *
00696  *  A predefined callback is provided: 
00697  *  - ssm_sort::sort_keys_t::noMOF  : vacuous - does nothing
00698  *
00699  *  The sort code checks for 
00700  *  \code
00701  sort_keys_t::marshal_func() == sort_keys_t::noMOF
00702  \endcode
00703  * in which case, it does not call a marshal function at all. (This
00704  * is less work than allocating the output object_t to call a
00705  * vacuous function.)
00706  */
00707 typedef w_rc_t (*MOF)(
00708     const rid_t&       rid,  // record id
00709     const object_t&    obj_in,
00710     key_cookie_t       cookie,  // type info
00711                 // func must allocate obj_out,
00712     object_t*         obj_out     // SM allocates, MOF initializes, SM
00713                 // frees its buffers
00714     );
00715 
00716 /**\typedef UMOF
00717  * \ingroup SSMSORT
00718  * \brief Un-marshal Object Function 
00719  * @param[in] rid  Record id of the source record to marshal
00720  * @param[in] obj_in Handle on the source record (object_t)
00721  * @param[in] cookie Describes location and type of key in the destination 
00722  * record
00723  * @param[out] obj_out Result is to be written to this object_t.  
00724  *
00725  * \details
00726  *
00727  * The \a obj_out is already allocated; this function must populate it.
00728  * The \a obj_out has no buffers or factory associated with it. The
00729  * MOF must populate the \a obj_out from its own factory (it may use
00730  * factory_t::cpp_vector, which uses the global heap, or it may 
00731  * use the factory from the \a obj_in, but ONLY if that factory is
00732  * \b not factory_t::none.
00733  * The \a obj_out
00734  * object carries its factory along with it throughout the sort process.
00735  *
00736  * This callback must not free any space for \a obj_in.
00737  *
00738  *  This type of callback is used 
00739  *  - when an object is written to a temp file
00740  *  - when the final object is written to the result file (if the
00741  *  output is a copy of the input file).
00742  *
00743  *  A predefined callback is provided: 
00744  *  - ssm_sort::sort_keys_t::noUMOF  : vacuous - does nothing
00745  *
00746  *  The sort code checks for 
00747  *  \code
00748  sort_keys_t::unmarshal_func() == sort_keys_t::noUMOF
00749  \endcode
00750  * in which case, it does not call a marshal function at all. (This
00751  * is less work than allocating the output object_t to call a
00752  * vacuous function.)
00753  *
00754  */
00755 typedef w_rc_t (*UMOF)(
00756     const rid_t&       rid,  // orig record id of object in buffer
00757     const object_t&    obj_in,
00758     key_cookie_t       cookie,  // type info
00759     object_t*        obj_out // SM allocates, UMOF initializes,
00760             // SM will copy to disk, free mem
00761     );
00762 
00763 /**\typedef CF
00764  * \ingroup SSMSORT
00765  * \brief key Comparison Function
00766  * @param[in] length1  Length of first key in bytes
00767  * @param[in] key1    Pointer to start of first key
00768  * @param[in] length2 Length of second key in bytes
00769  * @param[in] key2    Pointer to start of second key
00770  * \retval int Negative if key1 < key2, 0 if equal, Positive if key1 > key2
00771  * \details
00772  * Used on every key comparison.  
00773  * \note It is up to the server to determine if the keys are properly
00774  * aligned for comparison as fundamental types. Alignment is determined by
00775  * the combined behavior of the CSKF and MOF callbacks, as well as of
00776  * the location of the keys in the original records.
00777  */
00778 typedef int (*CF) (
00779     w_base_t::uint4_t length1, 
00780     const void *      key1, 
00781     w_base_t::uint4_t length2, 
00782     const void *      key2
00783 );
00784 
00785 /**\typedef LEXFUNC
00786  * \ingroup SSMSORT
00787  * \brief Lexify key function
00788  * @param[in] source    Pointer to start of key
00789  * @param[in] len    Length of key
00790  * @param[out] sink    Pointer to output buffer (preallocated, of length len)
00791  * \details
00792  *
00793  * This type of function is called by the generic_CSKF. It may be useful
00794  * for user-defined CKSF callbacks.  Its purpose is to reformat keys 
00795  * into lexicographic form so that simple string-type (byte-by-byte)
00796  * comparisons yield the same results as typed comparisons would.
00797  * An alternative to lexifying keys and using string comparisons is to
00798  * use typed comparisons, however, that has its disadvantages, particularly
00799  * for keys that might be spread across page boundaries or are not aligned.
00800  */
00801 typedef w_rc_t (*LEXFUNC) (const void *source, smsize_t len, void *sink);
00802 
00803 /**\brief A cookie passed to generic_CSKF callback must point to one of these.
00804  * \ingroup SSMSORT
00805  * \details
00806  * This struct may be used by user-defined CSKF callback functions.
00807  * For example:
00808     \code
00809     w_rc_t 
00810     example_stringCSKF(
00811         const rid_t&        ,  // not used
00812         const object_t&     obj_in,
00813         key_cookie_t        cookie,  // generic_CSKF_cookie
00814         factory_t&          , // not used
00815         skey_t*             out
00816     )
00817     {
00818         // Cast the cookie to a pointer to a generic_CSKF_cookie.
00819         generic_CSKF_cookie&  K = *(generic_CSKF_cookie*)(cookie.make_ptr());
00820         bool is_in_header = false;
00821 
00822         // Populate the output : point to key in body of record/object 
00823         new(out) skey_t(obj_in, K.offset, K.length, is_in_header);
00824         return RCOK;
00825     }
00826     \endcode
00827  */
00828 struct generic_CSKF_cookie 
00829 {
00830     /// value == noLEXFUNC means don't lexify
00831     LEXFUNC    func; 
00832     /// True if the key is in the record header, false if in body.
00833     bool       in_hdr;
00834     /// Offset from start of record header or body.
00835     smsize_t   offset;
00836     /// Key length.
00837     smsize_t   length;
00838 
00839     void clear() { length = 0; offset = 0; in_hdr = false; func = NULL ; }
00840 };
00841 
00842 /**\brief Parameter to control behavior of sort_file. 
00843  * \ingroup SSMSORT
00844  * \details
00845  * This class determines how the sort will behave, and it holds descriptions
00846  * of the keys to be used for a sort.  
00847  * For details of the effect on a sort, see \ref SORTOUTPUT.
00848  * For details about sort keys, see \ref SORTKEYS.
00849  *
00850  * He who creates this data structure determines
00851  * what is "most significant" key, next-most significant key, etc.
00852  * Key 0 gets compared first, 1 next, and so on.
00853  *
00854  * For related types, see the ssm_sort namespace.
00855  */
00856 class sort_keys_t 
00857 {
00858 public:
00859     typedef ssm_sort::LEXFUNC LEXFUNC;
00860  
00861     /**\brief Lexify callback function that does a simple memory copy 
00862      * @param[in] source    Pointer to start of key
00863      * @param[in] len    Length of key
00864      * @param[out] sink    Pointer to output buffer
00865      * 
00866      * \details
00867      * Does no reformatting; simply copies from source to sink.
00868      */
00869     static w_rc_t noLEXFUNC (const void *source, smsize_t len, void *sink);
00870 
00871     /**\brief Vacuous callback function, does nothing.
00872      * @param[in] rid   Ignored.
00873      * @param[in] obj  Ignored.
00874      * @param[in] cookie  Ignored.
00875      * @param[in] f  Ignored.
00876      * @param[out] out   Ignored.
00877      * \details
00878      * This function should never be used.  It is a default value.
00879      * The sort_file checks for
00880      * \code sort_keys_t::lexify() == sort_keys_t::noCSKF 
00881      * \endcode
00882      * and if so, bypasses any code connected with key creation, 
00883      * using the object_t it would have passed in to this function
00884      * as if it were the output of this function.
00885      * This comparison and bypass is faster than executing the
00886      * prologue and epilogue code to acquire space and release it,
00887      * needed when a CSKF is called.
00888      */
00889     static w_rc_t noCSKF(
00890         const rid_t&       rid,
00891         const object_t&    obj,
00892         key_cookie_t       cookie,  // type info
00893         factory_t&         f,
00894         skey_t*            out
00895     );
00896 
00897     /**\brief Either copies or lexifies a key.
00898      * @param[in] rid   Record ID of the record containing the key.
00899      * @param[in] in_obj  This refers to the record containing the key.
00900      * @param[in] cookie  Must be a pointer to a generic_CSKF_cookie,
00901      *                 which tells it which \ref LEXFUNC function to call 
00902      *                 (noLEXFUNC indicates straight copy),
00903      *                 and also tells it the length and location (offset) 
00904      *                 of the key.
00905      * @param[in] f    A heap manager for allocating space.
00906      * @param[out] out   Result is written here.
00907      *
00908      * \details
00909      * One normally expects the user to provide the entire
00910      * function for this, but we have this generic version just
00911      * for simplifying the handling of basic types for backward
00912      * compatibility.
00913      */
00914     static w_rc_t generic_CSKF(
00915         const rid_t&     rid,
00916         const object_t&  in_obj,
00917         key_cookie_t     cookie,  // type info
00918         factory_t&       f,
00919         skey_t*          out
00920     );
00921 
00922     /**\brief Vacuous Marshal Object Function
00923      * \details
00924      * This function is never called; rather, the
00925      * sort code checks for 
00926      * \code sort_keys_t::marshal_func() ==sort_keys_t::noMOF 
00927      * \endcode and if so, bypasses any code specific to
00928      * marshalling. This is done because the preparatory work
00929      * for calling a marshal function includes allocating space for
00930      * the results, and it is cheaper to bypass it altogether.
00931      */
00932     static w_rc_t noMOF (
00933         const rid_t&     ,  
00934         const object_t&  ,
00935         key_cookie_t     ,  
00936         object_t*    
00937     );
00938 
00939     /**\brief Vacuous Unmarshal Object Function
00940      * \details
00941      * This function is never called; rather, the
00942      * sort code checks for 
00943      * \code sort_keys_t::unmarshal_func() == sort_keys_t::noMOF 
00944      * \endcode and if so, bypasses any code specific to
00945      * unmarshalling. This is done because the preparatory work
00946      * for calling an unmarshal function includes allocating space for
00947      * the results, and it is cheaper to bypass it altogether.
00948      */
00949     static w_rc_t noUMOF (
00950         const rid_t&     ,  
00951         const object_t&  ,
00952         key_cookie_t     ,  
00953         object_t*    
00954     );
00955 
00956     /*
00957      * Default comparision functions for in-buffer-pool
00958      * comparisons.  No byte-swapping is done; alignment
00959      * requirements must already be met before calling these.
00960      */
00961     static int string_cmp(w_base_t::uint4_t , const void* , 
00962             w_base_t::uint4_t , const void*);
00963     static int uint8_cmp(w_base_t::uint4_t , const void* , 
00964             w_base_t::uint4_t , const void* );
00965     static int int8_cmp(w_base_t::uint4_t , const void* , 
00966             w_base_t::uint4_t , const void* );
00967     static int uint4_cmp(w_base_t::uint4_t , const void* , 
00968             w_base_t::uint4_t , const void* );
00969     static int int4_cmp(w_base_t::uint4_t , const void* , 
00970             w_base_t::uint4_t , const void* );
00971     static int uint2_cmp(w_base_t::uint4_t , const void* , 
00972             w_base_t::uint4_t , const void* );
00973     static int int2_cmp(w_base_t::uint4_t , const void* , 
00974             w_base_t::uint4_t , const void* );
00975     static int uint1_cmp(w_base_t::uint4_t , const void* , 
00976             w_base_t::uint4_t , const void* );
00977     static int int1_cmp(w_base_t::uint4_t , const void* , 
00978             w_base_t::uint4_t , const void* );
00979     static int f4_cmp(w_base_t::uint4_t , const void* , 
00980             w_base_t::uint4_t , const void* );
00981     static int f8_cmp(w_base_t::uint4_t , const void* , 
00982             w_base_t::uint4_t , const void* );
00983 
00984 public:
00985     //@{
00986     /** @name LEXFUNCs
00987      * LEXFUNC (q.v.) functions for fundamental types.
00988      */
00989     static w_rc_t f8_lex(const void *source, smsize_t len, void *sink);
00990     static w_rc_t f4_lex(const void *source, smsize_t len, void *sink);
00991     static w_rc_t u8_lex(const void *source, smsize_t len, void *sink);
00992     static w_rc_t i8_lex(const void *source, smsize_t len, void *sink);
00993     static w_rc_t u4_lex(const void *source, smsize_t len, void *sink);
00994     static w_rc_t i4_lex(const void *source, smsize_t len, void *sink);
00995     static w_rc_t u2_lex(const void *source, smsize_t len, void *sink);
00996     static w_rc_t i2_lex(const void *source, smsize_t len, void *sink);
00997     static w_rc_t u1_lex(const void *source, smsize_t len, void *sink);
00998     static w_rc_t i1_lex(const void *source, smsize_t len, void *sink);
00999     //@}
01000 
01001 
01002 private:
01003 
01004      /* Metadata about a key -- info about the key
01005      *     as it appears in all objects, not per-object
01006      *      key info.
01007      */
01008     class key_meta_t {
01009         public:
01010         /* 
01011          * callbacks:
01012          */ 
01013         CF              _cb_keycmp;  // comparison
01014         CSKF            _cb_keyinfo; // get location/copy/derived
01015         key_cookie_t    _cookie;
01016         w_base_t::int1_t _mask;
01017         fill1            _dummy1;
01018         fill2            _dummy2;
01019         key_meta_t() : _cb_keycmp(0), _cb_keyinfo(0), 
01020             _cookie(0),
01021             _mask(t_none) {}
01022         key_meta_t(const key_meta_t &old) : 
01023             _cb_keycmp(old._cb_keycmp), 
01024             _cb_keyinfo(old._cb_keyinfo),
01025             _cookie(old._cookie),
01026             _mask(old._mask) {}
01027     };
01028     int        _nkeys;        // constructor 
01029     int        _spaces;    // _grow
01030     key_meta_t*     _meta;
01031     key_location_t* _locs;
01032     bool    _stable;    // set_stable, is_stable
01033     bool    _for_index;    // set_for_index, is_for_index
01034     bool    _remove_dups;    // set_unique
01035     bool    _remove_dup_nulls; // set_null_unique
01036     bool    _ascending;    // ascending, set_ascending
01037     bool    _deep_copy;    // set_for_index, set_for_file
01038     bool    _keep_orig_file;// set_for_index, set_for_file, set_keep_orig
01039     bool    _carry_obj;    // set_for_file, carry_obj, set_carry_obj
01040 
01041     CSKF    _cb_lexify_index_key;    // set_for_index, index key lexify
01042     key_cookie_t _cb_lexify_index_key_cookie;    // set_for_index
01043 
01044     MOF        _cb_marshal;    // set_object_marshal
01045     UMOF    _cb_unmarshal;    // set_object_marshal
01046     key_cookie_t _cb_marshal_cookie;    // set_object_marshal
01047 
01048     void _grow(int i);
01049 
01050     void _prep(int key);
01051     void _set_loc(int key, smsize_t off, smsize_t len) {
01052         key_location_t &t = _locs[key];
01053         t._off = off;
01054         t._length = len;
01055     }
01056     void _set_mask(int key,
01057         bool fixed, 
01058         bool aligned, 
01059         bool lexico, 
01060         bool in_header,
01061         CSKF gfunc,
01062         CF   cfunc,
01063         key_cookie_t cookie
01064     ) {
01065         key_meta_t &t = _meta[key];
01066         if(aligned) t._mask |= t_aligned;
01067         else t._mask &= ~t_aligned;
01068         if(fixed) t._mask |= t_fixed;
01069         else t._mask &= ~t_fixed;
01070         if(lexico) t._mask |= t_lexico;
01071         else t._mask &= ~t_lexico;
01072         if(in_header) t._mask |= sort_keys_t::t_hdr;
01073         else t._mask &= ~sort_keys_t::t_hdr;
01074         t._cb_keycmp = cfunc;
01075         t._cb_keyinfo = gfunc;
01076         w_assert3(cfunc);
01077         if( ! fixed) {
01078             w_assert3(gfunc);
01079         }
01080         t._cookie = cookie;
01081     }
01082 
01083     void _copy(const sort_keys_t &old);
01084 
01085     /*
01086      * mask:
01087      * t_fixed: key location is fixed for all records: no need
01088      *    to call CSKF for each record 
01089      * t_aligned: key location adequately aligned (relative
01090      *    to the beginning of the record, which is 4-byte aligned) for
01091      *    in-buffer-pool comparisons with the given comparison
01092      *    function.  Copy to a contiguous buffer is unnecessary iff
01093      *    the entire key happens to be on one page (which is always
01094      *    the case with small records). 
01095      * t_lexico: Key is already in lexicographic order and
01096      *    can be spread across pages, and the comparison 
01097      *    function (usually string-compare or byte-compare) 
01098      *    can be called on successive segments of the key -- 
01099      *    copying they key to a contiguous buffer is unnecessary.
01100      *    Implies key is adequately aligned, but does not imply t_fixed.
01101      *
01102      *  t_hdr: key is at offset in hdr rather than in body 
01103      */
01104 
01105     enum mask { t_none = 0x0, 
01106         t_fixed = 0x1, 
01107         t_aligned =0x2, 
01108         t_lexico=0x4,
01109         t_hdr = 0x40  // key found in hdr
01110         };
01111 
01112 public:
01113     /// Create a structure that's ready to be populated with \a nkeys keys.
01114     NORET sort_keys_t(int nkeys);
01115 
01116     NORET ~sort_keys_t() {
01117         delete[] _meta;
01118         delete[] _locs;
01119     }
01120 
01121     /// Copy operator.
01122     NORET sort_keys_t(const sort_keys_t &old);
01123 
01124     /// Return number of keys.
01125     int     nkeys() const { return _nkeys; }
01126 
01127     /**\brief Output is to be suitable for index bulk-load, index key is sort
01128      * key.
01129      *
01130      * Call this if you want the output file to be written with objects 
01131      * of the form 
01132      * * hdr == key, body==rid
01133      * and the input file not to be destroyed.
01134      * This file is suitable for bulk-loading an index, and
01135      * the index key is the sort key.
01136      *
01137      * You \b must provide conversion functions for the sort key
01138      * to be converted to a lexicographic format string if it is not
01139      * already in such format in the original record, if the
01140      * index key (being the sort key) is to be used in a B+-Tree.
01141      *
01142      * Only one sort key is supported when sorting for index bulk-load, but
01143      * the key may be derived, and so the CSKF callback can combine
01144      * multiple keys, and lexifying them ensures that they can be
01145      * sorted as one.  This is not entirely sufficient to cover all
01146      * cases of multiple keys, but it will do for many cases, particularly
01147      * where the sub-keys are of fixed length.
01148      */
01149     int  set_for_index() {
01150             _keep_orig_file = true;
01151             _deep_copy = false;
01152             _for_index = true; 
01153             if(_nkeys > 1) {
01154                 return 1; // BAD 
01155                 // we only support single-key btrees
01156             }
01157             _cb_lexify_index_key = sort_keys_t::noCSKF;
01158             _cb_lexify_index_key_cookie = key_cookie_t(0);
01159             return 0;
01160         }
01161 
01162     /**\brief Output is to be suitable for index bulk-load, index key is
01163      * different from the sort key.
01164      * @param[in] lfunc Key creation/location function for the index key.
01165      * @param[in] ck Datum for \a lfunc
01166      * \details
01167      *
01168      * Only one sort key can be used.
01169      *
01170      * Call this if you want the output file
01171      * to be written with objects of the form 
01172      * hdr == key, body==rid
01173      * and the input file not to be destroyed,
01174      * and you wish the index key to be different from the sort key.
01175      *
01176      * The \a lfunc argument must produce an index key 
01177      * in \ref LEXICOFORMAT "lexicographic format" if the index is to
01178      * be a B+-Tree.  This function is called when the record is
01179      * first encountered (reading the input file), since the record is
01180      * already pinned to gather a sort key.
01181      *
01182      * Only one sort key is supported when bulk-loading for indexes, but
01183      * the key may be derived, and so the CSKF callback can combine
01184      * multiple keys, and lexifying them ensures that they can be
01185      * sorted as one.  This is not entirely sufficient to cover all
01186      * cases of multiple keys, but it will do for many cases, particularly
01187      * where the sub-keys are of fixed length.
01188      */
01189     int   set_for_index(CSKF lfunc, key_cookie_t ck) {
01190             set_for_index();
01191             if(!lfunc) lfunc = sort_keys_t::noCSKF;
01192             _cb_lexify_index_key = lfunc;
01193             _cb_lexify_index_key_cookie = ck;
01194             return 0;
01195         }
01196 
01197     /// Return true if set_for_index() was called.
01198     bool    is_for_index() const { return _for_index; }
01199 
01200     /// Return true if set_for_file() was called.
01201     bool    is_for_file() const { return !_for_index; }
01202 
01203     /// Return true if set_stable() 
01204     bool    is_stable() const { return _stable; }
01205 
01206     /// Ensure stable sort.  Cannot be used with set_for_index.
01207     void    set_stable(bool val) { _stable = val; }
01208 
01209     /// Return the function that creates or locates and/or lexifies the key.
01210     CSKF    lexify_index_key() const { return _cb_lexify_index_key; }
01211 
01212     /// Pointer to input datum for the associated lexify() CSKF. 
01213     key_cookie_t     lexify_index_key_cookie() const { return _cb_lexify_index_key_cookie; }
01214 
01215     /**\brief Output is to be a the input records, sorted 
01216      * @param[in] deepcopy Use true if you want a deep copy.
01217      * @param[in] keeporig Use true if you want to retain the input file.
01218      * @param[in] carry_obj Use true if you want to carry along the entire
01219      *                      objects through the scratch files and to the
01220      *                      output file. Used only for is_for_file().
01221      *
01222      * Call this if you want the output file
01223      * to contain copies of the input file records, undulterated,
01224      * but in sorted order.
01225      *
01226      * Multiple keys are supported.  Use of a CSKF is not needed if
01227      * the keys are embedded in the records, suitably aligned, and do
01228      * not cross page boundaries (string comparisons excepted, of course,
01229      * as string-comparison methods can be called repeatedly on successive
01230      * corresponding portions of string keys).
01231      */
01232     int     set_for_file(bool deepcopy, bool keeporig, bool carry_obj) {
01233             _for_index = false;
01234             (void) set_carry_obj(carry_obj);
01235             (void) set_deep_copy(deepcopy);
01236             return set_keep_orig(keeporig);
01237         }
01238 
01239     /**\brief Ensure marshaling and unmarshaling of the objects.
01240      * @param[in] marshal MOF to be used when reading records from disk.
01241      * @param[in] unmarshal UMOF to be used to write records to disk,
01242      * @param[in] c Arguemtn to \a marshal and \a unmarshal
01243      *
01244      * Call this if the objects in the file need to be byte-swapped
01245      * or otherwise marshaled before use, and if they need to be
01246      * unmarshaled before the output file is written.
01247      * This may be used with set_for_index or set_for_file.
01248      */
01249     int     set_object_marshal( MOF marshal, UMOF unmarshal, key_cookie_t c) {
01250             _cb_marshal = marshal;
01251             _cb_unmarshal = unmarshal;
01252             _cb_marshal_cookie = c;
01253             return 0;
01254         }
01255 
01256     /**\brief Return the marshal function or noMOF if none was given */
01257     MOF     marshal_func() const { return _cb_marshal; }
01258     /**\brief Return the unmarshal function or noUMOF if none was given */
01259     UMOF    unmarshal_func() const { return _cb_unmarshal; }
01260 
01261     /**\brief Pointer to datum for marshal and unmarshal functions */
01262     key_cookie_t    marshal_cookie() const { return _cb_marshal_cookie; }
01263 
01264     /**\brief True if sort will be in ascending order */
01265     bool    is_ascending() const { return _ascending; } 
01266 
01267     /**\brief Ensure that sort will be in ascending order */
01268     int     set_ascending( bool value = true) {  _ascending = value; return 0; }
01269 
01270     /**\brief True if duplicate keys and their records will be removed.
01271      *
01272      * When duplicates are encountered, they are sorted by record-id,
01273      * and the larger of the two (per umemcmp)  is removed.
01274      * */
01275     bool    is_unique() const { return _remove_dups; } 
01276 
01277     /**\brief Ensure that duplicate keys and their records will be removed 
01278      * */
01279     int     set_unique( bool value = true) {  _remove_dups = value; return 0; }
01280 
01281     /**\brief True if duplicate null keys and their records will be removed 
01282      * 
01283      * When duplicates are encountered, they are sorted by record-id,
01284      * and the larger of the two (per umemcmp) is removed.
01285      * */
01286     bool    null_unique() const { return _remove_dup_nulls; } 
01287 
01288     /**\brief Ensure that duplicate null keys and their records will be removed
01289      */
01290     int     set_null_unique( bool value = true) {  _remove_dup_nulls = value; 
01291             return 0; }
01292 
01293     /**\brief True if the sort will copy the entire objects through each
01294      * phase.
01295      * \details
01296      * Used when is_for_file only.
01297      */
01298     bool    carry_obj() const {  return _carry_obj; }
01299 
01300     /**\brief Control whether the sort will copy the entire objects through each
01301      * phase.
01302      * @param[in] value   If true, ensure keep_orig().
01303      * \details
01304      * Used when is_for_file only.
01305      * This is useful if the keys are fixed and consume most of the
01306      * original objects, in which case there is no need for the
01307      * sort code to duplicate the key as well as the object 
01308      * in the temporary output files, or to re-pin the original
01309      * records to copy them to the output file.
01310      */
01311     int     set_carry_obj(bool value = true) {  
01312         return _carry_obj = value; 
01313         return 0;
01314         }
01315 
01316     /**\brief True if the sort will copy the entire objects to the
01317      * result file.
01318      */
01319     bool    deep_copy() const {  return _deep_copy; }
01320     /**\brief Control whether the sort will copy the entire objects to the
01321      * result file.
01322      * @param[in] value   If true, ensure deep_copy().
01323      * \details
01324      * Used when is_for_file only.
01325      * When large objects appear in the input file and the input (original)
01326      * file is not to be kept, sort can copy only the metadata for the
01327      * large objects and reassign the large-object store to the result file.
01328      * This eliminates a lot of object creation and logging.
01329      */
01330     int     set_deep_copy(bool value = true ) {  
01331         _deep_copy = value; 
01332         return 0; 
01333         }
01334 
01335     /**\brief Return true if the sort will not destroy the input file.
01336      * \details
01337      * Used when is_for_file only.
01338      * This is turned on automatically when set_for_index.
01339      */
01340     bool    keep_orig() const {  return _keep_orig_file; }
01341     /**\brief Control whether the sort will not destroy the input file.
01342      * @param[in] value   If true, ensure keep_orig().
01343      * \details
01344      * Used when is_for_file only.
01345      * This is turned on automatically when set_for_index.
01346      */
01347     int     set_keep_orig(bool value = true ) {  
01348         if(value && !_deep_copy) return 1;
01349         _keep_orig_file = value;
01350         return 0;
01351         }
01352 
01353     /**\brief Set attributes of key.
01354      * @param[in] keyindex The ordinal number of the index whose 
01355      *                      attributes are to be set
01356      * @param[in] off Offset from beginning 
01357      *               of record header or body where the key is to be found
01358      * @param[in] len Length of key in recordd
01359      * @param[in] in_header True indicates that the key is 
01360      *                 to be found in the record header rather than in the body.
01361      * @param[in] aligned True indicates that the key, as found in the record,
01362      *                  is suitably aligned for key comparisons with the
01363      *                  CF (key-comparison function) to be used. False means
01364      *                  that sort has to make an aligned copy before
01365      *                  doing a key comparison.
01366      * @param[in] lexico True indicates that the key, as found in the record,
01367      *                  is already in \ref LEXICOFORMAT "lexicographic format"
01368      * @param[in] cfunc Key comparison function to use on this key.
01369      *
01370      * \retval 0 if OK, 1 if error.
01371      * \details
01372      * You must call this or set_sortkey_derived for each of the keys.
01373      *
01374      * There must be implicit agreement between what the \a cfunc
01375      * expects and the arguments \a aligned and \a lexico.
01376      */
01377     int set_sortkey_fixed(
01378         int keyindex,  // Key number origin 0
01379         smsize_t off,  // offset from beginning of hdr or body
01380         smsize_t len,  // length
01381         bool in_header,// header or body
01382         bool aligned,  // is aligned
01383         bool lexico,   // needs lexifying
01384         CF   cfunc     // comparison func
01385         );
01386 
01387     /**\brief Set attributes of key.
01388      * @param[in] keyindex The ordinal number of the index whose 
01389      *                      attributes are to be set
01390      * @param[in] gfunc Key-creation (lexify) function.
01391      * @param[in] cookie Datum for \a gfunc.
01392      * @param[in] in_header True indicates that the key is 
01393      *                 to be found in the record header rather than in the body.
01394      * @param[in] aligned True indicates that the key, as found in the 
01395      *                  result of \a gfunc,
01396      *                  is suitably aligned for key comparisons with the
01397      *                  CF (key-comparison function) to be used. False means
01398      *                  that sort has to make an aligned copy before
01399      *                  doing a key comparison.
01400      * @param[in] lexico True indicates that the key, as found in the result
01401      *                  of \a gfunc,
01402      *                  is in \ref LEXICOFORMAT "lexicographic format".
01403      * @param[in] cfunc Key comparison function to use on this key.
01404      *
01405      * \retval 0 if OK, 1 if error.
01406      * \details
01407      * You must call this or set_sortkey_fixed for each of the keys.
01408      *
01409      * There must be implicit agreement between what the \a cfunc
01410      * expects and the arguments \a aligned and \a lexico.
01411      */
01412     int set_sortkey_derived(
01413         int keyindex, 
01414         CSKF gfunc,
01415         key_cookie_t cookie,
01416         bool in_header,
01417         bool aligned, 
01418         bool lexico, 
01419         CF   cfunc
01420         );
01421 
01422     /**\brief Return the key-location information for a given fixed-location key.
01423      * @param[in] i The ordinal number of the index of interest.
01424      * \details
01425      * Only for fixed-location keys.
01426      */
01427     key_location_t&  get_location(int i) { return _locs[i]; }
01428 
01429     /**\brief Return the offset for a given fixed-location key.
01430      * @param[in] i The ordinal number of the index of interest.
01431      * \details
01432      * Only for fixed-location keys.
01433      */
01434     smsize_t offset(int i) const {
01435         return _locs[i]._off;
01436     }
01437     /**\brief Return the offset for a given fixed-location key.
01438      * @param[in] i The ordinal number of the index of interest.
01439      * \details
01440      * Only for fixed-location keys.
01441      */
01442     smsize_t length(int i) const {
01443         return _locs[i]._length;
01444     }
01445     /**\brief Return the CSKF function for a given derived key.
01446      * @param[in] i The ordinal number of the index of interest.
01447      * \details
01448      * Only for derived keys.
01449      */
01450     CSKF keycreate(int i) const {
01451         return _meta[i]._cb_keyinfo;
01452     }
01453 
01454     /**\brief Return the argument to the CSKF function for a given derived key.
01455      * @param[in] i The ordinal number of the index of interest.
01456      * \details
01457      * Only for derived keys.
01458      */
01459     key_cookie_t cookie(int i) const {
01460         return _meta[i]._cookie;
01461     }
01462 
01463     /**\brief Return the key-comparison function for a given key.
01464      * @param[in] i The ordinal number of the index of interest.
01465      * \details
01466      * For fixed-location and derived keys.
01467      */
01468     CF keycmp(int i) const {
01469         return _meta[i]._cb_keycmp;
01470     }
01471 
01472     /// Return true if key \a i is in lexicographic format in the input record
01473     bool     is_lexico(int i) const {
01474         return (_meta[i]._mask & t_lexico)?true:false;
01475     }
01476     /// Return true if key \a i is in a fixed location in all input records
01477     bool     is_fixed(int i) const {
01478         return (_meta[i]._mask & t_fixed)?true:false;
01479     }
01480     /// Return true if key \a i is in suitably aligned in the input record for the key-comparison function
01481     bool     is_aligned(int i) const {
01482         return (_meta[i]._mask & t_aligned)?true:false;
01483     }
01484     /// True if the key or the source of a derived key is to be found in the record header
01485     bool     in_hdr(int i) const {
01486         return (_meta[i]._mask & t_hdr)?true:false;
01487     }
01488 };
01489 
01490 } // end namespace ssm_sort
01491 
01492 inline void 
01493 ssm_sort::sort_keys_t::_grow(int i) 
01494 {
01495     // realloc it to accommodate i more entries
01496     {
01497     key_location_t* tmp = new key_location_t[_spaces + i];
01498     if(!tmp) W_FATAL(fcOUTOFMEMORY);
01499     if(_locs) {
01500         memcpy(tmp, _locs, nkeys() * sizeof(key_location_t));
01501         delete[] _locs;
01502     }
01503     _locs = tmp;
01504     }
01505     {
01506     key_meta_t* tmp = new key_meta_t[_spaces + i];
01507     if(!tmp) W_FATAL(fcOUTOFMEMORY);
01508     if(_meta) {
01509         memcpy(tmp, _meta, nkeys() * sizeof(key_meta_t));
01510         delete[] _meta;
01511     }
01512     _meta = tmp;
01513     }
01514     _spaces += i;
01515     // don't change nkeys
01516 }
01517 inline NORET 
01518 ssm_sort::sort_keys_t::sort_keys_t(int nkeys):
01519     _nkeys(0),
01520     _spaces(0),
01521     _meta(0),
01522     _locs(0),
01523     _stable(false), _for_index(false), 
01524     _remove_dups(false), _remove_dup_nulls(false),
01525     _ascending(true), _deep_copy(false), _keep_orig_file(false),
01526     _carry_obj(false),
01527     _cb_lexify_index_key(sort_keys_t::noCSKF),
01528     _cb_lexify_index_key_cookie(0),
01529     _cb_marshal(sort_keys_t::noMOF),
01530     _cb_unmarshal(sort_keys_t::noUMOF),
01531     _cb_marshal_cookie(0)
01532 { 
01533     _grow(nkeys);
01534 }
01535 
01536 inline void 
01537 ssm_sort::sort_keys_t::_prep(int key) 
01538 {
01539     if(key >= _spaces) {
01540     _grow(5);
01541     }
01542     if(key >= _nkeys) {
01543     // NB: hazard if not all of these filled in!
01544     _nkeys = key+1;
01545     }
01546 }
01547 
01548 inline void 
01549 ssm_sort::sort_keys_t::_copy(const sort_keys_t &old) 
01550 {
01551     _prep(old.nkeys()-1);
01552     int i;
01553     for(i=0; i< old.nkeys(); i++) {
01554     _locs[i] = old._locs[i];
01555     _meta[i] = old._meta[i];
01556     }
01557     w_assert3(nkeys() == old.nkeys());
01558     w_assert3(_spaces >= old._spaces);
01559     for(i=0; i< old.nkeys(); i++) {
01560     w_assert3(_meta[i]._mask == old._meta[i]._mask);
01561     }
01562 }
01563 
01564 inline NORET 
01565 ssm_sort::sort_keys_t::sort_keys_t(const sort_keys_t &old) : 
01566     _nkeys(0), _spaces(0), 
01567     _meta(0), _locs(0) 
01568 {
01569     _copy(old);
01570 }
01571 
01572 inline int 
01573 ssm_sort::sort_keys_t::set_sortkey_fixed(
01574     int key, 
01575     smsize_t off, 
01576     smsize_t len,
01577     bool in_header,
01578     bool aligned, 
01579     bool lexico,
01580     CF   cfunc
01581 ) 
01582 {
01583     if(is_for_index() && key > 0) {
01584         return 1;
01585     }
01586     _prep(key);
01587     _set_mask(key,  true, aligned, lexico, 
01588                 in_header, noCSKF, cfunc, key_cookie_t::null);
01589     _set_loc(key,  off, len);
01590     return 0;
01591 }
01592 
01593 inline int 
01594 ssm_sort::sort_keys_t::set_sortkey_derived(
01595     int key, 
01596     CSKF gfunc,
01597     key_cookie_t cookie,
01598     bool in_header,
01599     bool aligned, 
01600     bool lexico, 
01601     CF   cfunc
01602 )
01603 {
01604     if(is_for_index() && key > 0) {
01605         return 1;
01606     }
01607     _prep(key);
01608     _set_mask(key, false, aligned, lexico, 
01609         in_header,
01610         gfunc, cfunc, 
01611         cookie);
01612     return 0;
01613 }
01614 
01615 /*<std-footer incl-file-exclusion='SORT_S_H'>  -- do not edit anything below this line -- */
01616 
01617 #endif          /*</std-footer>*/

Generated on Wed Jul 7 17:22:32 2010 for Shore Storage Manager by  doxygen 1.4.7