sm.h

00001 /* -*- mode:C++; c-basic-offset:4 -*-
00002      Shore-MT -- Multi-threaded port of the SHORE storage manager
00003    
00004                        Copyright (c) 2007-2009
00005       Data Intensive Applications and Systems Labaratory (DIAS)
00006                Ecole Polytechnique Federale de Lausanne
00007    
00008                          All Rights Reserved.
00009    
00010    Permission to use, copy, modify and distribute this software and
00011    its documentation is hereby granted, provided that both the
00012    copyright notice and this permission notice appear in all copies of
00013    the software, derivative works or modified versions, and any
00014    portions thereof, and that both notices appear in supporting
00015    documentation.
00016    
00017    This code is distributed in the hope that it will be useful, but
00018    WITHOUT ANY WARRANTY; without even the implied warranty of
00019    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. THE AUTHORS
00020    DISCLAIM ANY LIABILITY OF ANY KIND FOR ANY DAMAGES WHATSOEVER
00021    RESULTING FROM THE USE OF THIS SOFTWARE.
00022 */
00023 
00024 /*<std-header orig-src='shore' incl-file-exclusion='SM_H'>
00025 
00026  $Id: sm.h,v 1.314 2010/07/07 20:50:24 nhall Exp $
00027 
00028 SHORE -- Scalable Heterogeneous Object REpository
00029 
00030 Copyright (c) 1994-99 Computer Sciences Department, University of
00031                       Wisconsin -- Madison
00032 All Rights Reserved.
00033 
00034 Permission to use, copy, modify and distribute this software and its
00035 documentation is hereby granted, provided that both the copyright
00036 notice and this permission notice appear in all copies of the
00037 software, derivative works or modified versions, and any portions
00038 thereof, and that both notices appear in supporting documentation.
00039 
00040 THE AUTHORS AND THE COMPUTER SCIENCES DEPARTMENT OF THE UNIVERSITY
00041 OF WISCONSIN - MADISON ALLOW FREE USE OF THIS SOFTWARE IN ITS
00042 "AS IS" CONDITION, AND THEY DISCLAIM ANY LIABILITY OF ANY KIND
00043 FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
00044 
00045 This software was developed with support by the Advanced Research
00046 Project Agency, ARPA order number 018 (formerly 8230), monitored by
00047 the U.S. Army Research Laboratory under contract DAAB07-91-C-Q518.
00048 Further funding for this work was provided by DARPA through
00049 Rome Research Laboratory Contract No. F30602-97-2-0247.
00050 
00051 */
00052 
00053 #ifndef SM_H
00054 #define SM_H
00055 
00056 #include "w_defines.h"
00057 
00058 /*  -- do not edit anything above this line --   </std-header>*/
00059 
00060 /*
00061  *  Stuff needed by value-added servers.  NOT meant to be included by
00062  *  internal SM .c files, except to the extent that they need these
00063  *  definitions used in the API.
00064  */
00065 
00066 #ifdef __GNUG__
00067 #pragma interface
00068 #endif
00069 
00070 #ifndef SM_INT_4_H
00071 #include <sm_int_4.h>
00072 #endif
00073 
00074 #ifndef SM_DU_STATS_H
00075 #include <sm_du_stats.h> // declares sm_du_stats_t
00076 #endif
00077 
00078 #ifndef SM_STATS_H
00079 #include <smstats.h> // declares sm_stats_info_t and sm_config_info_t
00080 #endif
00081 
00082 #ifndef SM_S_H
00083 #include <sm_s.h> // declares key_type_s, rid_t, lsn_t
00084 #endif
00085 
00086 #ifndef LEXIFY_H
00087 #include <lexify.h> // declares sortorder with constants
00088 #endif
00089 
00090 #ifndef NBOX_H
00091 #include <nbox.h>   // key_info_t contains nbox_t
00092 #endif /* NBOX_H */
00093 
00094 #ifndef SORT_S_H
00095 #include <sort_s.h> // declares key_info_t
00096 #endif
00097 
00098 /* DOXYGEN Documentation : */
00099 
00100 /**\addtogroup LOGSPACE 
00101  *
00102  * Updates performed by transactions are logged so that
00103  * they can be rolled back (in the event of a transaction abort)
00104  * or restored (in the event of a crash).  Both the old and new values
00105  * of an updated location are logged.  This allows a steal, no-force
00106  * buffer management policy, which means the buffer manager is free
00107  * to write dirty pages to disk at any time and yet does not have
00108  * to write dirty pages for a transaction to commit.
00109  *
00110  * The log is stored in a set of Unix files, all in the same directory,
00111  * whose path is determined by a run-time option.
00112  * The maximum size of the log is also determined by a run-time option.
00113  * The proper value of the log size depends on
00114  * the expected transaction mix.  More specifically, it depends on the
00115  * age of the oldest (longest running) transaction in the system and
00116  * the amount of log space used by all active transactions. Here are
00117  * some general rules to determine the  amount  of  free  log  space
00118  * available in the system.
00119  * - Log records between the first log
00120  *   record generated by the oldest active transaction and the most
00121  *   recent log record generated by any transaction cannot be thrown
00122  *   away.
00123  * - Log records from a transaction are no longer needed
00124  *   once the transaction has committed or completely aborted and all
00125  *   updates have made it to disk. Aborting a transaction causes log space
00126  *   to be used, so space is reserved for aborting each transaction.
00127  *   Enough log space must be available to commit or abort all active
00128  *   transactions at all times.
00129  * 
00130  * - Only space starting at the beginning of the log can be reused.  
00131  *   This space can be reused if it contains log records only for 
00132  *   transactions meeting the previous rule.
00133  *
00134  * -  All storage manager calls that update records require log space twice
00135  *    the size of the space updated in the record. All calls that create,
00136  *    append, or truncate records require log space equal to the size
00137  *    created, inserted, or deleted. Log records generated by these calls
00138  *    (generally one per call) have an overhead of approximately 50 bytes.
00139  *
00140  * - The amount of log space reserved for aborting a transaction is equal to 
00141  *   the amount of log space generated by the transaction plus a fudge 
00142  *   factor. 
00143  *   (Where btrees are concerned, a structure modification
00144  *   might be necessary on abort, using more space on abort, or might not be
00145  *   necessary on abort where it was done during forward processing, 
00146  *   using less space on abort.)
00147  *
00148  * - The transaction assumes responsibility for reserving space in the
00149  *   log so that it can abort, should it need to (without leaving an
00150  *   unrecoverable volume).  The transaction and the log cooperate to
00151  *   reserve space for the transaction's aborting.
00152  *
00153  * - When insufficient log space is available for a transaction, the 
00154  *   transaction is (may be, depending on the server) aborted.
00155  *   The storage manager will return an error indication (out of log space)
00156  *   if it is unable to insert a log record into the log due to
00157  *   insufficient space.
00158  *
00159  *   \bug GNATS 142 There remain a number of places in the storage manager code
00160  *   that react to a lack of log space with a fatal error; this is a
00161  *   hold-over from the original storage manager, before any attempt to
00162  *   reserve space was in place.  This code has to be rewritten to handle more
00163  *   gracefully such errors.  In order for this to be done, the
00164  *   multi-threaded transaction support will be deprecated.
00165  *
00166  * Checkpoints are taken periodically by the storage manager in order to 
00167  * free log space and shorten recovery time.  Checkpoints are "fuzzy" 
00168  * and do not require the system to pause while they are completing.
00169  *
00170  * See the storage manager constructor ss_m::ss_m for more information
00171  * about handling out-of-logspace conditions.
00172  *
00173  */
00174 
00175 /**\addtogroup SSMOPT
00176  *
00177  * These are the run-time options for the storage manager.
00178  *
00179  * -sm_bufpoolsize : 
00180  *      - type: number
00181  *      - description: This is the size of 
00182  *      the buffer pool in Kb.  Must be large enough to hold at least 32 pages,
00183  *      so it depends on the configured page size.
00184  *      - default: none
00185  *      - required?: yes
00186  *
00187  * -sm_hugetlbfs_path
00188  *      - type: string (full absolute path name)
00189  *      - description: Needed only if you configured --with-hugetlbfs.
00190  *      - default: see \ref CONFIGOPT
00191  *      - required?: no
00192  *
00193  * -sm_reformat_log
00194  *      - type: Boolean
00195  *      - description: If "yes", your log will be clobbered and the storage
00196  *      manager will start up with an entirely new log.
00197  *      - default: no
00198  *      - required?: no
00199  *
00200  * -sm_logdir
00201  *      - type: string (relative or absolute path name)
00202  *      - description: Location of the log files.
00203  *      - default: none
00204  *      - required?: yes
00205  *
00206  * -sm_logbufsize
00207  *      - type: number
00208  *      - description: size of log buffer in KB.
00209  *      Must be greater than or equal to the larger of
00210  *      (4 times the page size, 64 Kb)
00211  *      and less than or equal to
00212  *      128 times the page_size. This is the size of 
00213  *      the log buffer in Kb.
00214  *      - default: 128
00215  *      - required?: no
00216  *
00217  * -sm_logsize
00218  *      - type: number
00219  *      - description: greater than or equal to 8256 
00220  *      This is the maximum size of the log in Kb.  It is a function of
00221  *      the log buffer size, and  the default is the minimum allowable for
00222  *      the default sm_logbufsize.
00223  *      - default: 128
00224  *      - required?: yes
00225  *
00226  * -sm_log_warn
00227  *      - type: number between 0 and 100 (percentage)
00228  *      - description: percentage of log that, when consumed by active
00229  *      transactions, triggers a callback warning of potential inability
00230  *      to roll back.   Should be less than 50.
00231  *      - default: 45
00232  *      - required?: no
00233  *
00234  * -sm_errlog
00235  *      - type: string (relative or absolute path name OR - )
00236  *      - description: Destination for error messages.  If "-" is given,
00237  *      the destination is stderr.
00238  *      - default: \b -
00239  *      - required?: no
00240  *
00241  * -sm_errlog_level
00242  *      - type: string  (one of none|emerg|fatal|internal|error|warning|info|debug)
00243  *      - description: filter.  Messages of this priority or higher are issued to
00244  *      the error log; messages with lower priority are not issued.
00245  *      The priorities are listed from high to low. "none" means no logging
00246  *      will happen.
00247  *      - default: error
00248  *      - required?: no
00249  *
00250  * -sm_locktablesize : 
00251  *      - type: number greater than or equal to 64
00252  *      - description: size of lock manager's hash table will be a prime
00253  *      number near and greater than the given number.
00254  *      - default: 64000 (yields a hash table with 65521 buckets)
00255  *      - required?: no
00256  *
00257  * -sm_lock_escalate_to_page_threshold
00258  *      - type: number greater than or equal to 0
00259  *      - description: after acquiring this many record locks on a page, the lock
00260  *      will be escalated to a page lock. A value of 0 disables escalation to a
00261  *      page lock.
00262  *      - default: 5
00263  *      - required?: no
00264  *
00265  * -sm_lock_escalate_to_store_threshold
00266  *      - type: number greater than or equal to 0
00267  *      - description: after acquiring this many page locks in a store,
00268  *      the lock will be escalated to a store lock. 
00269  *      A value of 0 disables escalation to a store lock.
00270  *      - default: 25
00271  *      - required?: no
00272  *      
00273  * -sm_lock_escalate_to_volume_threshold
00274  *      - type: number greater than or equal to 0
00275  *      - description: after acquiring this many store locks in a volume,
00276  *      the lock will be escalated to a volume lock. 
00277  *      A value of 0 disables escalation to a volume lock.
00278  *      - default: 0
00279  *      - required?: no
00280  *
00281  * -sm_cc_alg
00282  *      - type: string (one of file | page | record | none)
00283  *      - description: default locking granularity for file operations.
00284  *      This can be overridden on a per-transaction basis with
00285  *      ss_m::set_xct_lock_level().
00286  *      - default: record
00287  *      - required?: no
00288  *
00289  * -sm_backgroundflush
00290  *      - type: Boolean
00291  *      - description: Enables background-flushing of volumes.
00292  *      Must be set to "yes" for sm_num_page_writers to have any effect.
00293  *      - default: yes
00294  *      - required?: no
00295  *
00296  * -sm_num_page_writers
00297  *      - type: number
00298  *      - description: greater than or equal to 0; this is the number of
00299  *      background-flushing threads for each volume. If you have 
00300  *      lots of threads, 
00301  *      a huge buffer pool, and few volumes, you should increase this.
00302  *      If sm_backgroundflush is "no", this value is ignored.
00303  *      - default: 2
00304  *      - required?: no
00305  *
00306  * -sm_prefetch
00307  *      - type: Boolean
00308  *      - description: Enables prefetching for scans.
00309  *      - default: no
00310  *      - required?: no
00311  *
00312  * -sm_logging
00313  *      - type: Boolean
00314  *      - description: Allows you to turn off logging for a run of
00315  *      the storage manager. This is only for experimentation, to
00316  *      measure logging overhead in a limited way.
00317  *      Aborts, rollbacks and restart/recovery 
00318  *      do not work without logging.   Independent concurrent
00319  *      transactions using btrees might not work without logging (this is
00320  *      not well-tested).
00321  *      Each time you start the server, you had better start with a
00322  *      clean device or a device that resulted from a clean shutdown
00323  *      of the prior run.
00324  *      - default: yes
00325  *      - required?: no
00326  *
00327  * -sm_lock_caching
00328  *      - type: Boolean
00329  *      - description: Enables caching of transaction locks in transaction.
00330  *      Can be turned off for experimentation. If no, the default is not
00331  *      to cache locks, but any transaction can turn on caching for itself
00332  *      by calling the ss_m method  set_lock_cache_enable(bool enable).
00333  *      - default: yes
00334  *      - required?: no
00335  *
00336  * \sa  \ref SSMVAS
00337  */
00338 
00339 
00340 /**\addtogroup SSMXCT 
00341  * All storage manager operations on data must be done within the scope of
00342  * a transaction (ss_m::begin_xct, ss_m::commit_xct, ss_m::abort_xct,
00343  * ss_m::chain_xct). 
00344  *
00345  * A very few storage manager operations, such as formatting a volume, are
00346  * called outside the scope of a transaction and the storage manager begins
00347  * its own transaction to do the work.
00348  *
00349  * Operations that fail return an error indication and the storage 
00350  * manager assumes that the server will thereafter abort the 
00351  * transaction in which the error occurred, when abort is indicated.
00352  * Abort is indicated when eUSERABORT or eDEADLOCK is returned and 
00353  * when the server chooses to abort rather than to work around the problem 
00354  * (whatever it might be, such as eRETRY).
00355  *
00356  * The storage manager does not enforce the aborting of any erroneous
00357  * transactions except, possibly, those that are in danger of 
00358  * running out of log space.
00359  * (This is done with the destructor of the prologue used on each call
00360  * to the storage manager, see next paragraph).
00361  *
00362  * It is always the server's responsibility to abort.
00363  * When the storage manager 
00364  * encounters a eLOGSPACEWARN condition (the log hasn't enough
00365  * space \e at \e this \e moment to abort the running transaction,
00366  * assuming a 1:1 ratio of rollback-logging overhead to forward-processing
00367  * logging overhead), it does one of two things:
00368  * - passes the error code eLOGSPACEWARN up the call stack back to the server
00369  *   if the storage manager was constructed with no log-space-warning callback
00370  *   argument (see LOG_WARN_CALLBACK_FUNC, ss_m::ss_m).
00371  * - tries to abort a transaction before passing an error code back up
00372  *   the call stack to the server. Choosing a victim transaction to abort
00373  *   is done by the server in its log-space-warning callback function (passed
00374  *   in on ss_m::ss_m, q.v.).
00375  *   Only if that callback function returns a non-null victim transaction
00376  *   and returns eUSERABORT does the storage manager abort that victim
00377  *   before returning eUSERABORT up the call stack. Any other
00378  *   error code returned by the callback function is just returned up
00379  *   the call stack.
00380  *
00381  * \section LOCKS Locks 
00382  *
00383  * The storage manager automatically acquires the 
00384  * necessary locks when the data are read or written.
00385  * The locks thus acquired are normally released at the end of a transaction,
00386  * thus, by default, transactions are two-phase and well-formed (degree 3).
00387  *
00388  * \subsection GRAN Lock Granularity
00389  * The fine-grained locks are normally used for records in files, but
00390  * provision is made for using coarser-grained locks.  The transaction
00391  * has a default lock level associated with it,
00392  * which governs the granularity of locks acquired by the storage manager
00393  * on behalf of the transaction.
00394  * The lock manager provides for lock escalation to coarser locks to
00395  * reduce the locking costs.  See \ref SSMLOCK and smlevel_0::concurrency_t. 
00396  *
00397  * Key-value locking is normally used for B+-Trees. (See \ref MOH1.)
00398  * R*-Trees normally use coarse-granularity locking.
00399  * The locking protocol used with an index is determined when the
00400  * index is created.  A transaction may acquire coarse (index-level)
00401  * locks with explicit calls to the lock manager, but by default, 
00402  * the granularity/level/protocol associated with the index is used.
00403  * See smlevel_0::concurrency_t. 
00404  *
00405  * \section DISTXCT Distributed Transactions
00406  * Storage manager transactions may be used as "threads" (to 
00407  * overload this term) of distributed transactions.  
00408  * Coordination of 2-phase commit must be done externally,
00409  * but the storage manager supports preparing the (local) transaction "thread" 
00410  * for two-phase commit, and it will log the necessary 
00411  * data for recovering in-doubt transactions.
00412  *
00413  * \section ATTACH Threads and Transactions
00414  * Transactions are not tied to storage manager threads (smthread_t, not
00415  * to be confused with a local "thread" of a distributed transaction) in any 
00416  * way other than that a transaction must be \e attached to a
00417  * thread while any storage manager work is being done on behalf of 
00418  * that transaction.   This is how the storage manager knows \e which
00419  * transaction is to acquire the locks and latches, etc.
00420  * But a thread can attach and detach from transactions at will, so
00421  * work may be performed by different threads each time the storage
00422  * manager is called on behalf of a given transaction; this allows the
00423  * server to keep a pool of threads to perform work and allows them to
00424  * perform work on behalf of any active transaction.
00425  *
00426  * \warning
00427  * While there are limited circumstances in which multiple threads can be
00428  * attached to the same transaction \e concurrently and perform storage 
00429  * manager operations on behalf of that transaction concurrently,
00430  * which is a hold-over from the original storage manager, this 
00431  * functionality will be deprecated soon.  The reason for this being
00432  * removed is that it is extremely difficult to handle errors internally
00433  * when multiple threads are attached to a transaction because 
00434  * partial rollback is impossible in the absence of multiple log streams
00435  * for a transaction.
00436  *
00437  * Under no circumstances may a thread attach to more than one transaction
00438  * at a time.
00439  *
00440  *
00441  * \section EXOTICA Exotica
00442  * The storage manager also provides 
00443  * - partial rollback (ss_m::save_work and ss_m::rollback_work), 
00444  *   which undoes actions but does not release locks,
00445  * - transaction chaining (ss_m::chain_xct), which commits, but retains locks
00446  *   and gives them to a new transaction,
00447  * - lock release (sm_quark_t, ss_m::unlock), allowing less-than-3-degree
00448  *   transactions.
00449  *
00450  *  To reduce the cost (particularly in logging) of loading databases,
00451  *  the storage manager provides for unlogged loading of stores.
00452  *  See \ref SSMSTORE.
00453  */
00454 
00455 
00456 /**\addtogroup SSMDEBUG 
00457  *
00458  * \section DEBUGLEV Build-time Debugging Options
00459  * At configure time, you can control which debugger-related options
00460  * (symbols, inlining, etc) are enabled with the debug-level options. See \ref CONFIGOPT.
00461  * \section SSMTRACE Tracing (--enable-trace)
00462  * When this build option is used, additional code is included in the build to
00463  * enable some limited tracing.  These C Preprocessor macros apply:
00464  * -W_TRACE
00465  *  --enable-trace defines this.
00466  * -FUNC
00467  *  Outputs the function name when the function is entered.
00468  * -DBG 
00469  *  Outputs the arguments.
00470  * -DBGTHRD 
00471  *  Outputs the arguments.
00472  *
00473  *  The tracing is controlled by these environment variables:
00474  *  -DEBUG_FLAGS: a list of file names to trace, e.g. "smfile.cpp log.cpp"
00475  *  -DEBUG_FILE: name of destination for the output. If not defined, the output
00476  *    is sent to cerr/stderr.
00477  *
00478  * See \ref CONFIGOPT.
00479  *  \note This tracing is not thread-safe, as it uses streams output.
00480  * \section SSMENABLERC Return Code Checking (--enable-checkrc)
00481  * If a w_rc_t is set but not checked with method is_error(), upon destruction the
00482  * w_rc_t will print a message to the effect "error not checked".
00483  * See \ref CONFIGOPT.
00484  *
00485  */
00486 
00487 /** \file sm_vas.h
00488  * \details
00489  * This is the include file that all value-added servers should
00490  * include to get the Shore Storage Manager API.
00491  *
00492  */
00493 /********************************************************************/
00494 
00495 class page_p;   // Forward declarations (through sort_stream_i): each line only
00496 class xct_t;    // introduces a class name; no definitions appear in this section.
00497 class device_m;
00498 class vec_t;
00499 class log_m;
00500 class lock_m;
00501 class btree_m;
00502 class file_m;
00503 class pool_m;
00504 class dir_m;
00505 class chkpt_m;
00506 class lid_m; 
00507 class sm_stats_cache_t;
00508 class option_group_t;   // parameter type of ss_m::setup_options()
00509 class option_t;
00510 class prologue_rc_t;    // named as a friend of ss_m below
00511 class rtree_m;
00512 class sort_stream_i;    // named as a friend of ss_m below
00513 
00514 /**\addtogroup SSMSP  
00515  * A transaction may perform a partial rollback using savepoints.
00516  * The transaction populates a savepoint by calling ss_m::save_work,
00517  * then it may roll back to that point with ss_m::rollback_work.
00518  * Locks acquired between the save_work and rollback_work are \e not
00519  * released.
00520  */
00521 
00522 /**\brief A point to which a transaction can roll back.
00523  * \ingroup SSMSP
00524  *\details
00525  * A transaction can do partial rollbacks with
00526  * save_work and rollback_work, which use this class to determine
00527  * how far to roll back.
00528  * It is a log sequence number (the lsn_t base class) for the work done
00529  * to the point when save_work is called, paired with a tid_t (see tid()).
00530  */
00531 class sm_save_point_t : public lsn_t {
00532 public:
00533     NORET            sm_save_point_t(): _tid(0,0) {}; // _tid built from (0,0); lsn_t base is default-constructed
00534     friend ostream& operator<<(ostream& o, const sm_save_point_t& p) { // text form: <tid>:<lsn>
00535         return o << p._tid << ':' << (const lsn_t&) p; // cast selects the lsn_t inserter for the base part
00536     }
00537     friend istream& operator>>(istream& i, sm_save_point_t& p) { // reads back the <tid>:<lsn> form
00538         char ch; // receives the separator written by operator<<; NOTE(review): its value is never checked against ':'
00539         return i >> p._tid >> ch >> (lsn_t&) p; // fills _tid, skips one char, then fills the lsn_t base
00540     }
00541     tid_t            tid() const { return _tid; } // returns a copy of the stored tid
00542 private:
00543     friend class ss_m; // grants ss_m access to the private _tid
00544     tid_t            _tid; // presumably the id of the transaction that took the save point -- TODO confirm
00545 };
00546 
00547 /**\addtogroup SSMQK  
00548  * A quark is a marker in the transaction's list of acquired locks.
00549  * One may release all short-duration locks acquired since the quark was inserted 
00550  * into the list via sm_quark_t::open().
00551  * The lock manager modifies the locks acquired inside a quark
00552  * so that non-extent locks are no longer than short-duration.
00553  *
00554  * This is for experimentation only, and is \e not well-tested or supported.
00555  *
00556  * How used:
00557  * \code
00558  * sm_quark_t *q = new sm_quark_t;
00559  * q->open();  // inserts marker in transaction's list.
00560  * ...
00561  * q->close(); // frees short-duration locks to the marker.
00562  * delete q;
00563  * \endcode
00564  *
00565  * Deleting the quark without closing it causes it to be closed.
00566  * Quarks may \e not be used with multi-threaded transactions.
00567  *
00568  * Note that if a transaction has multiple threads attached when
00569  * a thread opens a quark, there is no way to determine where the
00570  * quark takes effect, and since it affects the locks acquired by
00571  * all threads of the transaction, it must be used very carefully
00572  * where multiply-threaded transactions are concerned.
00573  */
00574 
00575 /**\brief List of locks acquired by a transaction since
00576  * the quark was "opened".   
00577  * \ingroup SSMQK
00578  * \details
00579  * When a quark is closed (by calling close()), 
00580  * the \a release parameter indicates if all short-duration read
00581  * locks acquired during the quark should be released (default: true).
00582  * \note Quarks are an experimental feature for use 
00583  * as a building block for a more general nested-transaction facility.
00584  *
00585  * \internal See lock_x.h
00586  */
00587 class sm_quark_t {
00588 public:
00589     NORET            sm_quark_t() {} // no explicit initializer: _tid is default-constructed
00590     NORET            ~sm_quark_t(); // defined elsewhere; the SSMQK notes say deleting an open quark closes it
00591 
00592     rc_t            open(); // per the SSMQK example: inserts the marker in the transaction's lock list
00593     rc_t            close(bool release=true); // per the SSMQK example: frees short-duration locks back to the marker
00594 
00595     tid_t            tid()const { return _tid; } // returns a copy of the stored tid
00596     operator         bool()const { return (_tid != tid_t::null); } // true iff _tid differs from tid_t::null; NOTE(review): non-explicit, so quarks convert implicitly in boolean/arithmetic contexts
00597     friend ostream& operator<<(ostream& o, const sm_quark_t& q); // stream output; defined elsewhere
00598     friend istream& operator>>(istream& i, sm_quark_t& q); // stream input; defined elsewhere
00599 
00600 private:
00601     friend class ss_m; // grants ss_m access to the private _tid
00602     tid_t            _tid; // presumably set when the quark is opened -- TODO confirm against open()
00603 
00604     // Copying is disabled: copy constructor and copy assignment are declared private (presumably left undefined).
00605     sm_quark_t(const sm_quark_t&);
00606     sm_quark_t& operator=(const sm_quark_t&);
00607 
00608 };
00609 
00610 class sm_store_info_t;   // forward declaration only; no definition in this section
00611 class log_entry;         // named as a friend of ss_m below
00612 class coordinator;       // named as a friend of ss_m below
00613 class tape_t;            // named as a friend of ss_m below
00614 /**\brief \b This \b is \b the \b SHORE \b Storage \b Manager \b API.
00615  *\details
00616  * Most of the API for using the storage manager is through this
00617  * interface class.
00618  */
00619 class ss_m : public smlevel_top 
00620 {
00621     friend class pin_i;
00622     friend class sort_stream_i;
00623     friend class prologue_rc_t;
00624     friend class log_entry;
00625     friend class coordinator;
00626     friend class tape_t;
00627 public:
00628 
00629     typedef smlevel_0::LOG_WARN_CALLBACK_FUNC LOG_WARN_CALLBACK_FUNC;
00630     typedef smlevel_0::LOG_ARCHIVED_CALLBACK_FUNC LOG_ARCHIVED_CALLBACK_FUNC;
00631     typedef smlevel_0::ndx_t ndx_t;
00632     typedef smlevel_0::concurrency_t concurrency_t;
00633     typedef smlevel_1::xct_state_t xct_state_t;
00634 
00635     typedef sm_store_property_t store_property_t;
00636 
00637 #if COMMENT
00638     //
00639     // Below is most of the interface for the SHORE Storage Manager.
00640     // The rest is located in pin.h, scan.h, and smthread.h
00641     //
00642 
00643     //
00644     // TEMPORARY FILES/INDEXES
00645     //
00646     // When a file or index is created there is a tmp_flag parameter
00647     // that when true indicates that the file is temporary.
00648     // Operations on a temporary file are not logged and the
00649     // file will be gone the next time the volume is mounted.
00650     //
00651     // TODO: IMPLEMENTATION NOTE on Temporary Files/Indexes:
00652     //        Temp files cannot be trusted after transaction abort.
00653     //            They should be marked for removal.
00654     //
00655     // CODE STRUCTURE:
00656     //    Almost all ss_m functions begin by creating a prologue object
00657     //    whose constructor and destructor check for many common errors.
00658     //    In addition most ss_m::OP() functions now call an ss_m::_OP()
00659     //    function to do the real work.  The ss_m::OP functions should
00660     //    not be called by other ss_m functions, instead the corresponding
00661     //    ss_m::_OP function should be used.
00662     //
00663 
00664 #endif /* COMMENT */
00665 
00666   public:
00667     /**\brief Add storage manager options to the given options group.
00668      *\ingroup SSMINIT
00669      *\details
00670      * @param[in] grp The caller's option group, to which the
00671      * storage manager's options will be added for processing soon.
00672      *
00673      * Before the ss_m constructor can be called, setup_options
00674      * \b must be called.  This will install the storage manager's options and
00675      * initialize any that are not required.
00676      * Once all required options have been set, an ss_m can be constructed.
00677      *
00678      *\note This is not thread-safe.  The application (server) must prevent
00679      * concurrent calls to setup_options.
00680      */
00681     static rc_t setup_options(option_group_t* grp);
00682 
00683     /**\brief  Initialize the storage manager.
00684      * \ingroup SSMINIT
00685      * \details
00686      * @param[in] warn   A callback function. This is called 
00687      * when/if the log is in danger of becoming "too full".
00688      * @param[in] get   A callback function. This is called 
00689      * when the storage manager needs an archived log file to be restored.
00690      *
00691      * When an ss_m object is created, the storage manager initializes itself
00692      * and,
00693      * if the sthreads package has not already been initialized by virtue
00694      * of an sthread_t running, the sthreads package is initialized now.
00695      *
00696      * The log is read and recovery is performed (\ref MHLPS), 
00697      * and control returns to
00698      * the caller, after which time
00699      * storage manager threads (instances of smthread_t) may be constructed and
00700      * the storage manager may be used.
00701      *
00702      * The storage manager is used by invoking its static methods.  
00703      * You may use them as follows:
00704      * \code
00705      * ss_m *UNIQ = new ss_m();
00706      *
00707      * W_DO(UNIQ->mount_dev(...))
00708      *     // or
00709      * W_DO(ss_m::mount_dev(...))
00710      * \endcode
00711      *
00712      *
00713      * Only one ss_m object may be extant at any time. If you try
00714      * to create another while the one exists, a fatal error will occur
00715      * (your program will choke with a message about your mistake).
00716      *
00717      * The callback argument given to the storage manager constructor
00718      * is called when the storage manager determines that it is in danger
00719      * of running out of log space.  Heuristics are used to guess when
00720      * this is the case.  
00721      *
00722      * If the function \a warn archives and removes log files, the function
00723      * \a get must be provided to restore those log files when the
00724      * storage manager needs them.
00725      *
00726      * For details and examples, see  \ref smlevel_0::LOG_WARN_CALLBACK_FUNC, 
00727      *  \ref smlevel_0::LOG_ARCHIVED_CALLBACK_FUNC, and 
00728      *  \ref LOGSPACE.
00729      */
00730     ss_m(LOG_WARN_CALLBACK_FUNC warn=NULL, LOG_ARCHIVED_CALLBACK_FUNC get=NULL);
00731 
00732     /**\brief  Shut down the storage manager.
00733      * \ingroup SSMINIT
00734      * \details
00735      * When the storage manager object is deleted, it shuts down.
00736      * Thereafter it is not usable until another ss_m object is 
00737      * constructed.
00738      */
00739     ~ss_m();
00740 
00741     /**\brief Cause the storage manager's shutting down to be done cleanly 
00742      * or to simulate a crash.
00743      * \ingroup SSMINIT
00744      * \details
00745      * @param[in] clean   True means shut down gracefully, false means simulate a crash.
00746      *
00747      * When the storage manager's destructor is called
00748      * the buffer pool is flushed to disk, unless this method is called 
00749      * with \a clean == \e false.
00750      *
00751      * \note If this method is used, it
00752      * must be called after the storage manager is 
00753      * constructed if it is to take effect. Each time the storage
00754      * manager is constructed, the state associated with this is set
00755      * to \e true, i.e., "shut down properly".
00756      *
00757      * \note This method is not thread-safe, only one thread should use this
00758      * at any time, presumably just before shutting down.
00759      */
00760     static void         set_shutdown_flag(bool clean);
00761 
00762     /**\brief Notify storage manager when a log file was archived by a
00763      * LOG_WARN_CALLBACK_FUNC.
00764      *
00765      * The arguments:
00766      * @param[in] logfile   Character string name of file archived.
00767      */
00768     static rc_t         log_file_was_archived(const char * logfile);
00769 
00770 private:
00771     void                _construct_once(LOG_WARN_CALLBACK_FUNC x=NULL,
00772                                            LOG_ARCHIVED_CALLBACK_FUNC y=NULL);
00773     void                _destruct_once();
00774 
00775 
00776 public:
00777     /**\addtogroup SSMXCT
00778      *
00779      * All work performed on behalf of a transaction must occur while that
00780      * transaction is "attached" to the thread that performs the work.
00781      * Creating a transaction attaches it to the thread that creates the transaction. 
00782      * The thread may detach from the transaction and attach to another.
00783      * Multiple threads may attach to a single transaction and do work in certain circumstances.   See \ref SSMMULTIXCT
00784      *
00785      * 
00786      */
00787     /**\brief Begin a transaction 
00788      *\ingroup SSMXCT
00789      * @param[in] timeout   Optional, controls blocking behavior.
00790      * \details
00791      *
00792      * Start a new transaction and "attach" it to this thread. 
00793      * No running transaction may be attached to this thread.
00794      * 
00795      * Storage manager methods that must block (e.g., to acquire a lock) 
00796      * will use the timeout given.  
00797      * The default timeout is the one associated with this thread.
00798      *
00799      * \sa timeout_in_ms
00800      */
00801     static rc_t           begin_xct(
00802         timeout_in_ms            timeout = WAIT_SPECIFIED_BY_THREAD);
00803 
00804     /**\brief Begin an instrumented transaction. 
00805      *\ingroup SSMXCT
00806      * @param[in] stats   Pointer to an allocated statistics-holding structure.
00807      * @param[in] timeout   Optional, controls blocking behavior.
00808      * \details
00809      * No running transaction may be already attached to this thread.
00810      * A new transaction is started and attached to the running thread.
00811      *
00812      * The transaction will be instrumented.
00813      * This structure is updated by the storage manager whenever a thread
00814      * detaches from this transaction.  The activity recorded during
00815      * the time the thread is attached to the transaction will be stored in
00816      * the per-transaction statistics.
00817      * \attention It is the client's 
00818      * responsibility to delete the statistics-holding structure.
00819      * 
00820      * Storage manager methods that must block (e.g., to acquire a lock) 
00821      * will use the timeout given.  
00822      * The default timeout is the one associated with this thread.
00823      *
00824      * \sa timeout_in_ms
00825      */
00826     static rc_t           begin_xct(
00827         sm_stats_info_t*         stats,  // allocated by caller
00828         timeout_in_ms            timeout = WAIT_SPECIFIED_BY_THREAD);
00829 
00830     /**\brief Begin a transaction and return the transaction id.
00831      *\ingroup SSMXCT
00832      * @param[out] tid      Transaction id of new transaction.
00833      * @param[in] timeout   Optional, controls blocking behavior.
00834      * \details
00835      *
00836      * No running transaction may be attached to this thread.
00837      * 
00838      * Storage manager methods that must block (e.g., to acquire a lock) 
00839      * will use the timeout given.  
00840      * The default timeout is the one associated with this thread.
00841      *
00842      * \sa timeout_in_ms
00843      */
00844     static rc_t           begin_xct(
00845         tid_t&                   tid,
00846         timeout_in_ms            timeout = WAIT_SPECIFIED_BY_THREAD);
00847 
00848     /**\addtogroup SSM2PC  
00849      * The storage manager contains support for externally-coordinated
00850      * transactions that use
00851      * two-phase-commit with presumed abort.
00852      * The server must provide the coordination and the coordinator is
00853      * assumed to have its own stable storage, and it is assumed to recover
00854      * from failures in a "short time", the precise meaning of which is given below.
00855      * A prepared transaction, like an active transaction,
00856      * consumes log space and holds locks.
00857      * Even if a prepared transaction does not hold locks needed by 
00858      * other transactions, it consumes resources in a way that can interfere 
00859      * with other transactions.
00860      * If a prepared transaction remains in the system for a long time 
00861      * while other transactions are running, eventually the storage 
00862      * manager needs the log space used (reserved) by the prepared transaction.
00863      * A coordinator must resolve its prepared transactions
00864      * before the storage manager effectively runs out of 
00865      * log space for other transactions in the system.
00866      * The amount of time involved is a function of the size of the log
00867      * and of the demands of the other transactions in the system.
00868      *
00869      * For the purpose of this discussion, the portion of a global 
00870      * transaction that involves a single Shore Storage Manager transaction is 
00871      * called a thread of the global transaction.
00872      *
00873      * A Shore transaction participates as a thread of a global transaction
00874      * as follows:
00875      - Start a storage-manager transaction with ss_m::begin_xct.
00876      - Acquire a global transaction identifier from the coordinator.
00877      - Indicate to the storage manager that this transaction is a 
00878      thread of a global transaction, and associate the global transaction 
00879      identifier with this thread by calling ss_m::enter_2pc.
00880      - Associate a coordinator with the transaction for recovery 
00881      purposes, by calling ss_m::set_coordinator.
00882      - Prepare the thread of the transaction and get the storage manager's 
00883      vote with ss_m::prepare_xct.  
00884      It is an error to commit a global transaction thread without first 
00885      preparing it.  It is an error to do anything else 
00886      in a transaction after it is prepared, except to end 
00887      the transaction or retry the prepare (to get the vote again).
00888      - Convey the vote to the coordinator, and determine the transaction's 
00889      fate from the coordinator.
00890      - End the thread with ss_m::commit_xct or ss_m::abort_xct.
00891      *
00892      * The storage manager 
00893      * logs the minimal information required to effect a vote of the
00894      * transaction threads that are storage manager transactions,
00895      * and to recover such in-doubt transactions after restart.
00896      * Thus, after a crash/restart, the server may query the storage manager
00897      * about in-doubt (prepared) transactions with ss_m::query_prepared_xct,
00898      * which tells the caller the number and global transaction IDs associated
00899      * with prepared transactions.
00900      * Using this, the server contacts the coordinator and resumes the
00901      * voting.
00902      * The server may find the local transaction IDs and use ss_m::tid_to_xct
00903      * to attach these transactions  and to resolve them.
00904      * 
00905      * Commit and abort of read-only transactions are the same,
00906      * as these transactions have no log entries.  Preparing read-only transactions
00907      * causes them to commit/abort and the vote returned is vote_readonly.
00908      * Once this vote is communicated to the coordinator and the coordinator
00909      * records it on stable storage, there is no need to involve this thread in
00910      * any further processing.  For this reason,
00911      * read-only transactions do not appear as prepared transactions at
00912      * recovery time.
00913      * 
00914      */
00915 
00916     /**\brief Make the attached transaction a thread of a distributed transaction.
00917      *\ingroup SSM2PC
00918      *
00919      * @param[in] gtid    Global transaction ID to associate with this transaction.  This will be logged when the transaction is prepared.
00920      * 
00921      * \note This can be called at most once for a given transaction.
00922      * The transaction must be attached to the calling thread.
00923      * No other threads may be attached to the transaction.
00924      */
00925     static rc_t           enter_2pc(const gtid_t &gtid); 
00926     /**\brief Assign a coordinator handle to this distributed transaction.
00927      *\ingroup SSM2PC
00928      * @param[in] h      Handle of the coordinator.  Not interpreted by
00929      * the storage manager.
00930      *
00931      * The storage manager associates this server handle with the transaction 
00932      * so that when the transaction is prepared, this information is 
00933      * written to the log. Upon recovery, if this transaction is still in doubt,
00934      * the value-added server can query the 
00935      * storage manager for in-doubt transactions, get their server handles,
00936      * and resolve the transactions.
00937      * See query_prepared_xct and recover_2pc.
00938      */
00939     static rc_t           set_coordinator(const server_handle_t &h); 
00940 
00941     /**\brief Prepare a thread of a distributed transaction.
00942      *\ingroup SSM2PC
00943      * @param[in] stats     Pointer to an allocated statistics-holding 
00944      *                      structure.
00945      * @param[out] vote     This thread's vote.
00946      *
00947      * The storage manager will prepare the attached transaction (a thread
00948      * of a distributed transaction) for commit.
00949      * If this transaction has performed no logged updates, the 
00950      * vote returned will be vote_readonly.
00951      * If this transaction can commit, the vote returned will be vote_commit.
00952      * If an error occurs during the prepare, the vote will be vote_abort.
00953      *
00954      * If the transaction is being instrumented, the 
00955      * statistics-holding structure will be returned to the caller, 
00956      * and the caller is responsible for its deallocation.
00957      */
00958     static rc_t           prepare_xct(
00959                             sm_stats_info_t*&         stats, 
00960                             vote_t&                   vote); 
00961 
00962     /**\brief Prepare a thread of a distributed transaction.
00963      *\ingroup SSM2PC
00964      * @param[out] vote     This thread's vote. See \ref w_base_t::vote_t.
00965      *
00966      * The storage manager will prepare the attached transaction (a thread
00967      * of a distributed transaction) for commit.
00968      * If this transaction has performed no logged updates, the 
00969      * vote returned will be vote_readonly.
00970      * If this transaction can commit, the vote returned will be vote_commit.
00971      * If an error occurs during the prepare, the vote will be vote_abort.
00972      */
00973     static rc_t           prepare_xct(vote_t &vote); 
00974 
00975     /**\brief Force the transaction to vote "read-only" in a two-phase commit. 
00976      *\ingroup SSM2PC
00977      * \details
00978      * This will override the storage manager's determination of 
00979      * whether this thread of a distributed transaction is read-only, which is
00980      * based on whether the local transaction thread logged anything. This
00981      * method may be useful if the local transaction rolled back to 
00982      * a savepoint.
00983      * See  \ref w_base_t::vote_t.
00984      */
00985     static rc_t           force_vote_readonly(); 
00986 
00987     /**\brief Given a global transaction id, find the local prepared 
00988      * transaction associated with it. 
00989      *\ingroup SSM2PC
00990      * @param[in] gtid     A global transaction ID (an opaque quantity 
00991      * to the storage manager).
00992      * @param[in] mayblock Not used.
00993      * @param[out] local   Return the transaction ID of the prepared 
00994      * SM transaction.
00995      * \details
00996      * Searches the transaction list for a prepared transaction with the given
00997      * global transaction id. If found, it returns a reference to the 
00998      * local transaction.  The transaction is attached to the running
00999      * thread before it is returned.
01000      */
01001     static rc_t           recover_2pc(const gtid_t & gtid,
01002         bool                      mayblock,
01003         tid_t &                   local
01004         );
01005 
01006     /**\brief  Return the number of prepared transactions.
01007      *\ingroup SSM2PC
01008      * @param[out] numtids   The number of in-doubt transactions.
01009      * \details
01010      * Used by a server at start-up, after recovery, to find out if
01011      * there are any in-doubt transactions.  If so, the server must
01012      * use the second form of query_prepared_xct to find the global
01013      * transaction IDs of these in-doubt transactions.
01014      */
01015     static rc_t           query_prepared_xct(int &numtids);
01016 
01017     /**\brief  Return the global transaction IDs of in-doubt transactions. 
01018      *\ingroup SSM2PC
01019      * @param[in] numtids   The number of global transaction ids in the list.
01020      * @param[out] l   The caller-provided list into which to write the 
01021      * global transaction-ids.
01022      * \details
01023      * Used by a server at start-up, after recovery, to find out the
01024      * global transaction IDs of the prepared transactions.  The storage
01025      * manager fills in the first numtids entries of the pre-allocated list.
01026      * The server may have first called the first form of query_prepared_xct
01027      * to find out how many such transactions there are after recovery.
01028      *
01029      * \attention Read-only transactions 
01030      * do not appear as in-doubt transactions. Because they did not
01031      * generate any log records, they will not be "discovered" by analysis.
01032      * The server must determine that any thread of a global transaction that
01033      * does not appear to be in doubt was a read-only thread or
01034      * it never prepared and thus has been aborted.
01035      * Read-only transactions that were prepared would have voted read-only,
01036      * and if the coordinator recorded that vote on stable storage, it
01037      * should not be concerned with these transaction threads any further.
01038      * If the coordinator does not have this information recorded, the
01039      * transaction thread could have been an aborted non-read-only transaction,
01040      * so the coordinator must, in this case, presume that the thread aborted
01041      * and thus make the global transaction abort.
01042      */
01043     static rc_t           query_prepared_xct(int numtids, gtid_t l[]);
01044 
01045 
01046     /**\brief Commit a transaction.
01047      *\ingroup SSMXCT
01048      * @param[in] lazy   Optional, controls flushing of log.
01049      * @param[out] plastlsn   If non-null, this is a pointer to a
01050      *                    log sequence number into which the storage
01051      *                    manager writes the LSN of the last log record
01052      *                    inserted for this transaction.
01053      * \details
01054      *
01055      * Commit the attached transaction and detach it, destroy it.
01056      * If \a lazy is true, the log is not synced.  This means that
01057      * recovery of this transaction might not be possible.
01058      */
01059     static rc_t           commit_xct(
01060                                      bool   lazy = false,
01061                                      lsn_t* plastlsn=NULL);
01062 
01063     /**\brief Commit an instrumented transaction and get its statistics.
01064      *\ingroup SSMXCT
01065      * @param[out] stats   Get a copy of the statistics for this transaction.
01066      * @param[in] lazy   Optional, controls flushing of log.
01067      * @param[out] plastlsn   If non-null, this is a pointer to a
01068      *                    log sequence number into which the storage
01069      *                    manager writes the LSN of the last log record
01070      *                    inserted for this transaction.
01071      * \details
01072      *
01073      * Commit the attached transaction and detach it, destroy it.
01074      * If \a lazy is true, the log is not synced.  This means that
01075      * recovery of this transaction might not be possible.
01076      */
01077     static rc_t            commit_xct(
01078                                     sm_stats_info_t*& stats, 
01079                                     bool              lazy = false,
01080                                     lsn_t*            plastlsn=NULL);
01081 
01082     /**\brief Commit an instrumented transaction and start a new one.
01083      *\ingroup SSMXCT
01084      * @param[out] stats   Get a copy of the statistics for the first transaction.
01085      * @param[in] lazy   Optional, controls flushing of log.
01086      * \details
01087      *
01088      * Commit the attached transaction and detach it, destroy it.
01089      * Start a new transaction and attach it to this thread.
01090      * \note \e The \e new 
01091      * \e transaction \e inherits \e the \e locks \e of \e the \e old 
01092      * \e transaction.
01093      *
01094      * If \a lazy is true, the log is not synced.  This means that
01095      * recovery of the committed transaction might not be possible.
01096      */
01097     static rc_t            chain_xct(
01098         sm_stats_info_t*&         stats,    /* in w/new, out w/old */
01099         bool                      lazy = false);  
01100 
01101     /**\brief Commit a transaction and start a new one, inheriting locks.
01102      *\ingroup SSMXCT
01103      * @param[in] lazy   Optional, controls flushing of log.
01104      * \details
01105      *
01106      * Commit the attached transaction and detach it, destroy it.
01107      * Start a new transaction and attach it to this thread.
01108      * \note \e The \e new 
01109      * \e transaction \e inherits \e the \e locks \e of \e the \e old 
01110      * \e transaction.
01111      *
01112      * If \a lazy is true, the log is not synced.  This means that
01113      * recovery of the committed transaction might not be possible.
01114      */
01115     static rc_t            chain_xct(bool lazy = false);  
01116 
01117     /**\brief Abort an instrumented transaction and get its statistics.
01118      *\ingroup SSMXCT
01119      * @param[out] stats   Get a copy of the statistics for this transaction.
01120      * \details
01121      *
01122      * Abort the attached transaction and detach it, destroy it.
01123      */
01124     static rc_t            abort_xct(sm_stats_info_t*&  stats);
01125     /**\brief Abort a transaction.
01126      *\ingroup SSMXCT
01127      * \details
01128      *
01129      * Abort the attached transaction and detach it, destroy it.
01130      */
01131     static rc_t            abort_xct();
01132 
01133     /**\brief Populate a save point.
01134      *\ingroup SSMSP
01135      * @param[out] sp   An sm_save_point_t owned by the caller.
01136      *\details
01137      * Store in sp the needed information to be able to roll back 
01138      * to this point. 
01139      * For use with rollback_work.
01140      * \note Only one thread may be attached to a transaction when this
01141      * is called.
01142      */
01143     static rc_t            save_work(sm_save_point_t& sp);
01144 
01145     /**\brief Roll back to a savepoint.
01146      *\ingroup SSMSP
01147      * @param[in] sp   An sm_save_point_t owned by the caller and
01148      * populated by save_work.
01149      *\details
01150      * Undo everything that was 
01151      * done from the time save_work was called on this savepoint.
01152      * \note Locks are not freed.
01153      *
01154      * \note Only one thread may be attached to a transaction when this
01155      * is called.
01156      */
01157     static rc_t            rollback_work(const sm_save_point_t& sp);
01158 
01159     /**\brief Return the number of transactions in active state.
01160      *\ingroup SSMXCT
01161      * \details
01162      * While this is thread-safe, the moment a value is returned, it could
01163      * be out of date.
01164      * Useful only for debugging.
01165      */
01166     static w_base_t::uint4_t     num_active_xcts();
01167 
01168     /**\brief Attach the given transaction to the currently-running smthread_t.
01169      *\ingroup SSMXCT
01170      * \details
01171      * It is assumed that the currently running thread is an smthread_t.
01172      */
01173     static void           attach_xct(xct_t *x) { me()->attach_xct(x); } // forwards x to me()->attach_xct()
01174 
01175     /**\addtogroup SSMMULTIXCT 
01176      * 
01177      * Certain operations may be performed while more than one
01178      * thread is attached to a transaction (this functionality is
01179      * soon to be deprecated).
01180      * Any number of attached threads may be read-only.
01181      * The kinds of updates that can be made by multiple threads are limited by
01182      * the need to avoid latch-mutex and latch-latch deadlocks. 
01183      *
01184      * There are several reasons for this.
01185      * 1) The multiple threads are not protected from each other by locks.
01186      * 2) Interleaving of top-level actions is not supported with rollback;
01187      * this means that for the duration of a top-level action, a thread needs
01188      * access to the log that excludes all other threads in 
01189      * the same transaction.
01190      *
01191      * The internal logging protocol is this:
01192      * T1: latch page, log update. Logging requires acquiring a mutex
01193      * on the xct's log buffer.
01194      * T2: performing any top-level action, acquires the mutex on the
01195      * xct's log buffer before doing the action (latching the page).
01196      *
01197      * Thus, anything involving top-level actions is suspect.  B-trees
01198      * use top-level actions, as does file-page allocation, and creation/
01199      * destruction of stores (files, indexes).  Thus, just about
01200      * any kind of concurrent updates on the same page
01201      * in the same transaction is problematic, and just about any update
01202      * can result in latching extent-map or store-map pages.
01203      * This activity could be disallowed by enforcing a strict 
01204      * rule that at most  one update operation can be going on 
01205      * in a transaction at any time, however this is too restrictive.
01206      *
01207      * Multiple updating threads can
01208      * work \b if \b the \b data \b are \b partitioned by volume.
01209      * So a well-behaved server may use multiple-threaded transactions
01210      * to do updates as long as the updates are on different \b volumes.
01211      * It might also allow read-only transaction threads to be
01212      * concurrent with a single updating thread.
01213      *
01214      * Savepoints and partial rollback may \e not be used with 
01215      * multi-threaded transactions. This is not enforced by the storage
01216      * manager; it is poor behavior on the part of a server.
01217      * For example, the behavior of the following is undefined:
01218      * - thread 1: attach, read,      read,   read, ...
01219      * - thread 2: attach, save work, update, rollback
01220      * If the two threads are reading and possibly updating the same 
01221      * data, the results are timing-dependent and could produce a latch-
01222      * latch or latch-mutex deadlock.
01223      *
01224      * Ongoing research at DIAS is investigating ways to extend the usefulness
01225      * of parallelism within a transaction (multi-threaded transactions).
01226      * Current thoughts about this are for servers to coordinate multiple 
01227      * transactions using two-phase commit or an optimized version
01228      * of commit and abort for groups of local transactions.
01229      */
01230 
01231     /**\brief Detach any attached transaction from the currently-running smthread_t.
01232      *\ingroup SSMXCT
01233      * \details
01234      * Sever the connection between the running thread and the transaction.
01235      * This allows the running thread to attach a different 
01236      * transaction and to perform work on its behalf.
01237      */
01238     static void           detach_xct() { xct_t *attached = me()->xct(); /* NULL when nothing is attached */
01239                                         if(attached != NULL) { me()->detach_xct(attached); } }
01240 
01241     /**\brief Get the transaction structure for a given transaction id.
01242      *\ingroup SSMXCT
01243      * @param[in] tid   Transaction ID.
01244      *\details
01245      * Return a pointer to the storage manager's transaction structure.
01246      * Can be used with detach_xct and attach_xct.
01247      */
01248     static xct_t*          tid_to_xct(const tid_t& tid);
01249     /**\brief Get the transaction ID for a given transaction structure.
01250      *\ingroup SSMXCT
01251      * @param[in] x   Pointer to transaction structure.
01252      *\details
01253      * Return the transaction ID for the given transaction.
01254      */
01255     static tid_t           xct_to_tid(const xct_t* x);
01256 
01257     /**\brief Print transaction information to an output stream.
01258      *\ingroup SSMAPIDEBUG
01259      * @param[in] o   Stream to which to write the information.
01260      * \details
01261      * This is for debugging only, and is not thread-safe. 
01262      */
01263     static rc_t            dump_xcts(ostream &o);
01264 
01265     /**\brief Get the transaction state for a given transaction (structure).
01266      *\ingroup SSMXCT
01267      * @param[in] x   Pointer to transaction structure.
01268      * \details
01269      * Returns the state of the transaction (active, prepared). It is
01270      * hard to get the state of an aborted or committed transaction, since
01271      * their structures no longer exist.
01272      */
01273     static xct_state_t     state_xct(const xct_t* x);
01274 
01275     /**\brief Return the amount of log this transaction would consume
01276      * if it rolled back.
01277      *\ingroup SSMXCT
01278      *
01279      * If a transaction aborts with eOUTOFLOGSPACE this function can
01280      * be used in conjunction with xct_reserve_log_space to
01281      * pre-allocate the needed amount of log space before retrying.
01282      */
01283     static smlevel_0::fileoff_t        xct_log_space_needed();
01284 
01285     /**\brief Require the specified amount of log space to be
01286      * available for this transaction before continuing.
01287      *\ingroup SSMXCT
01288      *
01289      * If a transaction risks running out of log space it can
01290      * pre-request some or all of the needed amount before starting in
01291      * order to improve its chances of success. Other new transactions
01292      * will be unable to acquire log space before this request is
01293      * granted (existing ones will be able to commit, unless they also
01294      * run out of space, because that tends to free up log space and
01295      * avoids wasting work).
01296      */
01297     static rc_t            xct_reserve_log_space(fileoff_t amt);
01298     
01299     /**\brief Get the locking granularity for the attached transaction.
01300      * \ingroup SSMLOCK
01301      */
01302     static concurrency_t   xct_lock_level();
01303     /**\brief Set the default locking level for the attached transaction.
01304      * \ingroup SSMLOCK
01305      * \details
01306      * @param[in] l  The level to use for the balance of this transaction.
01307      * Legitimate values are t_cc_record,  t_cc_page,  t_cc_file.
01308      *
01309      * \note Only one thread may be attached to the transaction when this
01310      * is called. If more than one thread is attached, a fatal error
01311      * will ensue.
01312      */
01313     static void            set_xct_lock_level(concurrency_t l);
01314 
01315     /**\brief Collect transaction information in a virtual table.
01316      * \ingroup SSMVTABLE
01317      * \details
01318      * @param[out] v  The virtual table to populate.
01319      * @param[in] names_too  If true, make the 
01320      *            first row of the table a list of the attribute names.
01321      *
01322      * All attribute values will be strings.
01323      * The virtual table v can be printed with its output operator
01324      * operator<< for ostreams.
01325      *
01326      * \attention Not atomic. Can yield stale data. 
01327      */
01328     static rc_t            xct_collect(vtable_t&v, bool names_too=true);
01329 
01330     /**\brief Collect buffer pool information in a virtual table.
01331      * \ingroup SSMVTABLE
01332      * \details
01333      * @param[out] v  The virtual table to populate.
01334      * @param[in] names_too  If true, make the 
01335      *            first row of the table a list of the attribute names.
01336      *
01337      * \attention Be wary of using this with a large buffer pool.
01338      *
01339      * All attribute values will be strings.
01340      * The virtual table v can be printed with its output operator
01341      * operator<< for ostreams.
01342      *
01343      * \attention Not atomic. Can yield stale data. 
01344      */
01345     static rc_t            bp_collect(vtable_t&v, bool names_too=true);
01346 
01347     /**\brief Collect lock table information in a virtual table.
01348      * \ingroup SSMVTABLE
01349      * \details
01350      * @param[out] v  The virtual table to populate.
01351      * @param[in] names_too  If true, make the 
01352      *            first row of the table a list of the attribute names.
01353      *
01354      * All attribute values will be strings.
01355      * The virtual table v can be printed with its output operator
01356      * operator<< for ostreams.
01357      *
01358      * \attention Not atomic. Can yield stale data. 
01359      * Cannot be used in a multi-threaded-transaction context.
01360      */
01361     static rc_t            lock_collect(vtable_t&v, bool names_too=true);
01362 
01363     /**\brief Collect thread information in a virtual table.
01364      * \ingroup SSMVTABLE
01365      * \details
01366      * @param[out] v  The virtual table to populate.
01367      * @param[in] names_too  If true, make the 
01368      *            first row of the table a list of the attribute names.
01369      *
01370      * All attribute values will be strings.
01371      * The virtual table v can be printed with its output operator
01372      * operator<< for ostreams.
01373      *
01374      * \attention Not thread-safe. Can yield stale data. 
01375      */
01376     static rc_t            thread_collect(vtable_t&v, bool names_too=true);
01377 
01378     /**\brief Take a checkpoint.
01379      * \ingroup SSMAPIDEBUG
01380      * \note For debugging only!
01381      *
01382      * Force the storage manager to take a checkpoint.
01383      * Checkpoints are fuzzy : they can be taken while most other
01384      * storage manager activity is happening, even though they have
01385      * to be serialized with respect to each other, and with respect to
01386      * a few other activities.
01387      *
01388      * This is thread-safe.
01389      */
01390     static rc_t            checkpoint();
01391 
01392     /**\brief Force the buffer pool to flush its pages to disk.
01393      * \ingroup SSMAPIDEBUG
01394      * @param[in] invalidate   True means discard pages after flush.
01395      * \note For debugging only!
01396      * \attention Do not call force_buffers with anything pinned.
01397      * You may cause latch-latch deadlocks, as this method has
01398      * to scan the entire buffer pool and possibly EX-latch pages to prevent
01399      * others from updating while it forces to disk.
01400      * Since the page-order is essentially random, we cannot
01401      * preclude latch-latch deadlocks with other threads.
01402      */
01403     static rc_t            force_buffers(bool invalidate = false);
01404 
01405     /**\brief Force the buffer pool to flush the volume header page(s)
01406      * to disk.
01407      * \ingroup SSMAPIDEBUG
01408      * @param[in] vid   ID of the volume of interest
01409      * \note For debugging only!
01410      * \attention Do not call force_vol_hdr_buffers with anything pinned.
01411      * You could cause latch-latch deadlocks, as this method has
01412      * to scan the entire buffer pool and possibly EX-latch some pages.
01413      * Since the page-order is essentially random, we cannot
01414      * preclude latch-latch deadlocks with other threads.
01415      */
01416     static rc_t            force_vol_hdr_buffers( const vid_t&   vid);
01417 
01418     /**\brief Force the buffer pool to flush to disk all pages
01419      * for the given store.
01420      * \ingroup SSMAPIDEBUG
01421      * @param[in] stid   Store whose pages are to be flushed.
01422      * @param[in] invalidate   True means discard the pages after flushing.
01423      * \note For debugging only!
01424      * \attention Do not call force_store_buffers with anything pinned.
01425      * You may cause latch-latch deadlocks, as this method has
01426      * to scan the entire buffer pool and, if invalidate==true,
01427      * EX-latch pages to prevent others from updating
01428      * while it forces to disk.
01429      * Since the page-order is essentially random, we cannot
01430      * preclude latch-latch deadlocks with other threads.
01431      */
01432     static rc_t            force_store_buffers(const stid_t & stid,
01433                                                bool invalidate);
01434 
01435     /**\cond skip 
01436      * Do not document. Very un-thread-safe.
01437      */
01438     static rc_t            dump_buffers(ostream &o);
01439     static rc_t            dump_locks(ostream &o);
01440     static rc_t            dump_locks(); // defaults to std::cout
01441     static rc_t            dump_exts(ostream &o, 
01442         vid_t                    v, 
01443         extnum_t                 start, 
01444         extnum_t                 end);
01445 
01446     static rc_t            dump_stores(ostream &o, 
01447         vid_t                    v, 
01448         int                      start, 
01449         int                      end);
01450 
01451     static rc_t            dump_histo(ostream &o, bool locked);
01452 
01453     static rc_t            snapshot_buffers(
01454         u_int&                 ndirty, 
01455         u_int&                 nclean, 
01456         u_int&                 nfree,
01457         u_int&                 nfixed);
01458     /**\endcond skip */
01459 
01460     /**\brief Get a copy of the statistics from an attached instrumented transaction.
01461      * \ingroup SSMXCT
01462      * \details
01463      * @param[out] stats Returns a copy of the statistics for this transaction.
01464      * @param[in] reset  If true, the statistics for this transaction will be zeroed.
01465      */
01466     static rc_t            gather_xct_stats(
01467         sm_stats_info_t&       stats, 
01468         bool                   reset = false);
01469 
01470     /**\brief Get a copy of the global statistics.
01471      * \ingroup SSMSTATS
01472      * \details
01473      * @param[out] stats A pre-allocated structure.
01474      */
01475     static rc_t            gather_stats(
01476         sm_stats_info_t&       stats
01477         );
01478 
01479     /**\brief Get a copy of configuration-dependent information.
01480      * \ingroup OPT
01481      * \details
01482      * @param[out] info A pre-allocated structure.
01483      */
01484     static rc_t            config_info(sm_config_info_t& info);
01485 
01486     /**\brief Set sleep time before I/O operations.
01487      * \ingroup SSMVOL
01488      * \details
01489      * This method sets a milli_sec delay to occur before 
01490      * each disk read/write operation.  This is for debugging.
01491      * It is useful in discovering thread sync bugs.
01492      * This delay applies to all threads.
01493     */
01494     static rc_t            set_disk_delay(u_int milli_sec);
01495 
01496     /**\cond skip */
01497     // TODO : document crash testing facilities
01498     /**\brief Simulate a crash
01499      * \details
01500      * This method tells the log manager to start generating corrupted
01501      * log records.  This will make it appear that a crash occurred
01502      * at that point in the log.  A call to this method should be
01503      * followed immediately by a dirty shutdown of the ssm.
01504      */
01505     static rc_t            start_log_corruption();
01506     /**\endcond skip */
01507 
01508     // Forces a log flush
01509     static rc_t             sync_log(bool block=true);
01510     static rc_t             flush_until(lsn_t& anlsn, bool block=true);
01511 
01512     // Allow access to the important LSNs (current and durable).
01513     static rc_t            get_curr_lsn(lsn_t& anlsn);
01514     static rc_t            get_durable_lsn(lsn_t& anlsn);
01515 
01516 
01517     /*
01518        Device and Volume Management
01519        ----------------------------
01520        A device is either an operating system file or operating system
01521        device and is identified by a path name (absolute or relative).
01522        A device has a quota.  In theory, a device may have 
01523        multiple volumes on it but
01524        in the current implementation the maximum number of volumes
01525        is 1.
01526 
01527        A volume is where data is stored.  A volume is identified
01528        uniquely and persistently by a long volume ID (lvid_t).
01529        Volumes can be used whenever the device they are located
01530        on is mounted by the SM.  Volumes have a quota.  The
01531        sum of the quotas of all the volumes on a device cannot
01532        exceed the device quota.
01533 
01534        The basic steps to begin using a new device/volume are:
01535         format_dev: initialize the device
01536         mount_dev: allow use of the device and all its volumes
01537         generate_new_lvid: generate a unique ID for the volume
01538         create_vol: create a volume on the device
01539      */
01540 
01541     /*
01542      * Device management functions
01543      */
01544      /**\addtogroup SSMVOL 
01545       * The storage manager was designed to permit multiple \e volumes
01546       * on a \e device, with \e volume analogous to a Unix \e partition and
01547       * a \e device analogous to a disk, and the original SHORE contained
01548       * symmetric peer servers.  
01549       * However good that intention, multiple volumes on a device were never
01550       * implemented, and times have changed, and the storage manager no
01551       * longer has any notion of remote and local volumes.
01552       * The notion of a volume, separate from a device, remains, but may
01553       * some day disappear.
01554       *
01555       * For the time being, a device contains at most one volume. 
01556       *
01557      * A device is either an operating system file or 
01558      * an operating system device (e.g., raw disk partition) and  
01559      * is identified by a path name (absolute or relative).
01560      *
01561      * A device has a quota.  
01562      * A device is intended to have multiple volumes on it, but
01563      * in the current implementation the maximum number of volumes
01564      * is exactly 1.
01565      *
01566      * A volume is where data are stored.  
01567      * Each volume is a header and a set of pages. All pages are
01568      * the same size (this is a compile-time constant, the default being
01569      * 8K and sizes up to 64K permissible).
01570      *
01571      * A volume is identified uniquely and persistently by a 
01572      * long volume ID (lvid_t), which is stored in its header.
01573      * Volumes can be used whenever the device they are located
01574      * on is mounted by the SM.  
01575      * Volumes have a quota.  The
01576      * sum of the quotas of all the volumes on a device cannot
01577      * exceed the device quota.
01578      *
01579      * A volume contains a variety of data structures. All user
01580      * data reside in \e stores.  A store is a collection of the
01581      * pages on the volume, allocated in \e extents of a size that
01582      * is a compile-time constant. (The storage manager has only
01583      * been tested with an extent-size of 8 pages. The compile-time constant
01584      * can be changed, but it also requires changes elsewhere in the code
01585      * to maintain alignment of persistent structures.
01586      * See the comments in config/shore.def.) Thus, the minimum size
01587      * of a store is one extent's worth of pages.
01588      * Larger extents provide better clustering, but more wasted space if
01589      * small files and small indexes will be common.
01590      *
01591      * Stores are identified by a store number (snum_t).
01592      *
01593      * Each volume contains a few stores that are "overhead":
01594      * 0 -- is reserved for an extent map and a store map
01595      * 1 -- directory (dir_m)
01596      * 2 -- root index 
01597      *
01598      * Beyond that, for each (user) file created, 2 stores are used, one for
01599      * small objects, one for large objects, and for each index (btree, rtree) 
01600      * created 1 store is used.
01601      *
01602      * Each volume is laid out thus:
01603      * - volume header, which identifies the number of extents on
01604      *   the volume, determined when the volume is formatted.
01605      *   This is always in page 1 of the volume.
01606      * - store map: some number of pages describing the stores on the volume,
01607      *   namely, being the heads of linked-lists of extents that make up
01608      *   the stores. The number of such pages is determined when the
01609      *   volume is formatted.  The worst case is assumed, namely that one
01610      *   might fill the volume with one-extent stores.
01611      * - extent map: some number of pages of bitmaps, one bitmap for each 
01612      *   extent, describing which pages in the extents are allocated or free.
01613      * - data pages: the rest of the volume.
01614      *
01615      */
01616 
01617     /**\brief Format a device.
01618      * \ingroup SSMVOL
01619      * \details
01620      * @param[in] device   Operating-system file name of the "device".
01621      * @param[in] quota_in_KB  Quota in kilobytes.
01622      * @param[in] force If true, format the device even if it already exists.
01623      *
01624      * Since raw devices always "exist", \a force should be given as true 
01625      * for raw devices.
01626      *
01627      * A device may not be formatted if it is already mounted.
01628      *
01629      * \note This method should \b not 
01630      * be called in the context of a transaction.
01631      */
01632     static rc_t            format_dev(
01633         const char*            device,
01634         smksize_t              quota_in_KB,
01635         bool                   force);
01636     
01637     /**\brief Mount a device.
01638      * \ingroup SSMVOL
01639      * \details
01640      * @param[in] device   Operating-system file name of the "device".
01641      * @param[out] vol_cnt Number of volumes on the device.
01642      * @param[out] devid  A local device id assigned by the storage manager.
01643      * @param[in] local_vid A local handle to the (only) volume on the device,
01644      * to be used when a volume is mounted.  The default, vid_t::null,
01645      * indicates that the storage manager can choose a value for this.
01646      *
01647      * \note It is fine to mount a device more than once, as long as device
01648      * is always the same (you cannot specify a hard link or soft link to
01649      * an entity mounted under a different path).
01650      * Device mounts are \b not reference-counted, so a single dismount_dev
01651      * renders the volumes on the device unusable.
01652      *
01653      * \note This method should \b not
01654      * be called in the context of a transaction.
01655      */
01656     static rc_t            mount_dev(
01657         const char*            device,
01658         u_int&                 vol_cnt,
01659         devid_t&               devid,
01660         vid_t                  local_vid = vid_t::null);
01661 
01662     /**\brief Dismount a device.
01663      * \ingroup SSMVOL
01664      * \details
01665      * @param[in] device   Operating-system file name of the "device".
01666      *
01667      * \note It is fine to mount a device more than once, as long as device
01668      * is always the same (you cannot specify a hard link or soft link to
01669      * an entity mounted under a different path). 
01670      * Device mounts are \b not reference-counted, so a single dismount_dev
01671      * renders the volumes on the device unusable.
01672      *
01673      * \note This method should \b not 
01674      * be called in the context of a transaction.
01675      */
01676 
01677     static rc_t            dismount_dev(const char* device);
01678 
01679     /**\brief Dismount all mounted devices.
01680      * \ingroup SSMVOL
01681      *
01682      * \note This method should \b not 
01683      * be called in the context of a transaction.
01684      */
01685     static rc_t            dismount_all();
01686 
01687     // list_devices returns an array of char* pointers to the names of
01688     // all mounted devices.  Note that the use of char*'s is
01689     // a temporary hack until a standard string class is available.
01690     // The char* pointers point directly into the device
01691     // mount table.
01692     // dev_cnt is the length of the list returned.
01693     // dev_list and devid_list must be deleted with delete [] by the
01694     // caller if they are not null (0).  They should be null
01695     // if an error is returned or if there are no devices.
01696     /**\brief Return a list of all mounted devices.
01697      * \ingroup SSMVOL
01698      * \details
01699      * @param[out] dev_list   Returned list of pointers directly into the mount table.
01700      * @param[out] devid_list   Returned list of associated device ids.
01701      * @param[out] dev_cnt   Returned number of entries in the two above lists.
01702      *
01703      * The storage manager allocates the arrays returned with new[], and the
01704      * caller must return these to the heap with delete[] if they are not null.
01705      * They will be null if an error is returned or if no devices are mounted.
01706      *
01707      * The strings to which dev_list[*] point are \b not to be deleted by
01708      * the caller.
01709      */
01710     static rc_t            list_devices(
01711         const char**&            dev_list, 
01712         devid_t*&                devid_list, 
01713         u_int&                   dev_cnt);
01714 
01715     /**\brief Return a list of all volumes on a device.
01716      * \ingroup SSMVOL
01717      * \details
01718      * @param[in] device   Operating-system file name of the "device".
01719      * @param[out] lvid_list   Returned array of long volume ids (lvid_t).
01720      * @param[out] lvid_cnt   Returned length of list lvid_list.
01721      *
01722      * The storage manager allocates the array lvid_list
01723      * with new[], and the
01724      * caller must return it to the heap with delete[] if it is not null.
01725      * It will be null if an error is returned.
01726      *
01727      * \note This method should \b not
01728      * be called in the context of a transaction.
01729      */
01730     static rc_t            list_volumes(
01731         const char*            device,
01732         lvid_t*&               lvid_list,
01733         u_int&                 lvid_cnt
01734     );
01735 
01736     // get_device_quota returns the "quota" (in KB) of the device
01737     // and the amount of the quota allocated to volumes on the device.
01738     /**\brief Get the device quota.
01739      * \ingroup SSMVOL
01740      * \details
01741      * @param[in] device   Operating-system file name of the "device".
01742      * @param[out] quota_KB   Returned quota in kilobytes
01743      * @param[out] quota_used_KB   Returned portion of quota allocated to volumes
01744      *
01745      * The quota_used_KB is the portion of the quota allocated to volumes on the device.
01746      *
01747      * \note This method \b may
01748      * be called in the context of a transaction.
01752      */
01753     static rc_t            get_device_quota(
01754         const char*             device, 
01755         smksize_t&              quota_KB, 
01756         smksize_t&              quota_used_KB);
01757 
01758 
01759     /*
01760      * Volume management functions
01761      */
01762 
01763     /**\brief Change the fake disk latency before I/Os on this volume, 
01764      * for debugging purposes
01765      * \ingroup SSMVOL
01766      * \details
01767      * @param[in] vid  The ID of the volume of interest.
01768      * @param[in] adelay  Nanoseconds to sleep with ::nanosleep()
01769      *
01770      * This is for debugging only.
01771      * Changing the value of the latency for a volume does not enable the
01772      * delay.
01773      */
01774     static rc_t set_fake_disk_latency(vid_t vid, const int adelay);
01775 
01776     /**\brief Enable the fake disk latency before I/Os on this volume, for debugging purposes
01777      * \ingroup SSMVOL
01778      * \details
01779      * @param[in] vid  The ID of the volume of interest.
01780      *
01781      * This is for debugging only.
01782      * When this is enabled, it uses whatever disk latency was set with
01783      * ss_m::create_vol() or the last applied ss_m::set_fake_disk_latency().
01784      */
01785     static rc_t enable_fake_disk_latency(vid_t vid);
01786     /**\brief Disable the fake disk latency before I/Os on this volume, for debugging purposes
01787      * \ingroup SSMVOL
01788      * \details
01789      * @param[in] vid  The ID of the volume of interest.
01790      *
01791      * This is for debugging only.
01792      */
01793     static rc_t disable_fake_disk_latency(vid_t vid);
01794 
01795 
01796     /**\brief Generate a new, unique long volume id.
01797      * \ingroup SSMVOL
01798      * \details
01799      * @param[out] lvid  Long volume id to be used on ss_m::create_vol().
01800      *
01801      * This generates a unique volume identifier to be written persistently
01802      * on the volume when it is formatted.
01803      * This enables us to avoid the mistake of doubly-mounting a volume.
01804      * The identifier is constructed from the machine network address and the
01805      * time of day.
01806      */
01807     static rc_t generate_new_lvid(lvid_t& lvid);
01808      
01809     /**\brief Add a volume to a device.
01810      * \ingroup SSMVOL
01811      * \details
01812      * @param[in] device_name   Operating-system file name of the "device".
01813      * @param[in] lvid  Long volume id to use when formatting the new volume.
01814      * @param[in] quota_KB  Quota in kilobytes.
01815      * @param[in] skip_raw_init  Do not initialize the volume if on a raw device.
01816      * @param[in] local_vid Short volume id by which to refer to this volume.
01817      *            If null, the storage manager will assign one.
01818      * @param[in] apply_fake_io_latency See ss_m::enable_fake_disk_latency()
01819      * @param[in] fake_disk_latency See ss_m::set_fake_disk_latency()
01820      *
01821      * \note This method should \b not 
01822      * be called in the context of a transaction.
01823      *
01824      * The pages on the volume \b must be zeroed; you can only use
01825      * \a skip_raw_init = true if you have by some other means
01826      * already initialized the volume.
01827      */
01828     static rc_t            create_vol(
01829         const char*             device_name,
01830         const lvid_t&           lvid,
01831         smksize_t               quota_KB,
01832         bool                    skip_raw_init = false,
01833         vid_t                   local_vid = vid_t::null,
01834         const bool              apply_fake_io_latency = false,
01835         const int               fake_disk_latency = 0);
01836 
01837     /**\brief Destroy a volume.
01838      * \ingroup SSMVOL
01839      * \details
01840      * @param[in] lvid  Long volume id by which the volume is known.
01841      *
01842      * \note This method should \b not 
01843      * be called in the context of a transaction.
01844      */
01845     static rc_t            destroy_vol(const lvid_t& lvid);
01846 
01847     /**\brief Gets the quotas associated with the volume.
01848      * \ingroup SSMVOL
01849      * @param[in] lvid  Long volume id by which the volume is known.
01850      * @param[out] quota_KB  Quota given when the volume was created.
01851      * @param[out] quota_used_KB  Portion of the quota that has been used by
01852      * allocated extents.
01853      */
01854     static rc_t            get_volume_quota(
01855         const lvid_t&             lvid, 
01856         smksize_t&                quota_KB, 
01857         smksize_t&                quota_used_KB);
01858 
01859     /**\cond skip */
01860     // check_volume_page_types: strictly for debugging/testing
01861     static rc_t             check_volume_page_types(vid_t vid);
01862     /**\endcond skip */
01863 
01864 
01865     /**\brief Analyze a volume and report statistics regarding disk usage.
01866      * \ingroup SSMVOL
01867      * @param[in] vid The volume of interest.
01868      * @param[out] du The structure that will hold the collected statistics.
01869      * @param[in] audit If "true", the method acquires a share lock on the
01870      * volume and then will check assertions about the
01871      * correctness of the data structures on the volume. 
01872      * If the audit fails an internal fatal error is generated 
01873      * to facilitate debugging. (It will generate a core file if your
01874      * shell permits such.)
01875      * If "false" an IS lock is acquired, which means that the
01876      * statistics will be fuzzy.
01877      *
01878      * Using the audit feature is useful for debugging.
01879      * It is the only safe way to use this method.
01880      * \note The statistics are added to the sm_du_stats_t structure passed in.
01881      * This structure is not cleared by the storage manager.
01882      */
01883     static rc_t            get_du_statistics(
01884         vid_t                 vid,
01885         sm_du_stats_t&        du,
01886         bool                  audit = true); 
01887 
01888     /**\brief Analyze a store and report statistics regarding disk usage.
01889      * \ingroup SSMVOL
01890      * @param[in] stid The store of interest.
01891      * @param[out] du The structure that will hold the collected statistics.
01892      * @param[in] audit If "true", the method acquires a share lock on the
01893      * store and then will check assertions about the
01894      * correctness of the data structures on the store. 
01895      *
01896      * Using the audit feature is useful for debugging.
01897      * It is the only safe way to use this method.
01898      *
01899      */
01900     static rc_t            get_du_statistics(
01901         const stid_t&        stid, 
01902         sm_du_stats_t&       du,
01903         bool                 audit = true);
01904     
01905 
01906     /**\brief Analyze  a volume and collect brief statistics about its usage.
01907      * \ingroup SSMVOL
01908      * @param[in] vid The volume of interest.
01909      * @param[out] volume_stats The statistics are written here.
01910      * @param[in] cc Indicates whether the volume is to be locked 
01911      * by this method. Acceptable values are t_cc_none and t_cc_volume.
01912      *
01913      * If no lock is acquired, the method can fail with eRETRY.
01914      *
01915      */
01916     static rc_t            get_volume_meta_stats(
01917         vid_t                vid,
01918         SmVolumeMetaStats&   volume_stats,
01919         concurrency_t        cc = t_cc_none
01920     );
01921 
01922     /**\brief Analyze a volume and collect brief statistics about the usage of individual files.
01923      * \ingroup SSMVOL
01924      * @param[in] vid The volume of interest.
01925      * @param[in] num_files The size of the array file_stats.
01926      * @param[out] file_stats Preallocated array of structs into which to
01927      * write the statistics for the individual files inspected.
01928      * @param[in] batch_calculate  True means make one pass over the volume.
01929      * @param[in] cc Indicates whether the volume is to be locked
01930      * by this method. Acceptable values are t_cc_none and t_cc_volume.
01931      *
01932      * If no lock is acquired and batch_calculate is not set,
01933      * the method can fail with eRETRY.
01934      *
01935      *
01936      * If batch_calculate is true then this works by making one pass
01937      * over the meta data, but it looks at all the meta data.  This
01938      * should be the faster way to do the analysis when there are
01939      * many files, and when files use a large portion of the volume.
01940      *
01941      * If batch_calculate is false then each file is updated
01942      * individually, only looking at the extent information for that
01943      * particular file. This requires a pass over the volume for each
01944      * file. (Seek-wise it is less efficient).
01945      *
01946      */
01947     static rc_t            get_file_meta_stats(
01948         vid_t                vid,
01949         w_base_t::uint4_t    num_files,
01950         SmFileMetaStats*     file_stats,
01951         bool                 batch_calculate = false,
01952         concurrency_t        cc = t_cc_none
01953     );
01954    
01955     /**\brief Get the index ID of the root index of the volume.
01956      * \ingroup SSMVOL
01957      *
01958      * @param[in] v Volume of interest.
01959      * @param[out] iid Store ID of the root index.
01960      * \details
01961      * Sets iid.vol to v and iid.store to store_id_root_index.
01962      * Each volume has a root index, which is a well-known
01963      * index available to the server for bootstrapping a database.
01964      * \return Always RCOK.
01965      */
01966     static rc_t            vol_root_index(
01967         const vid_t&        v, 
01968         stid_t&             iid
01969     )    { iid.vol = v; iid.store = store_id_root_index; return RCOK; }
01970 
01971     /*****************************************************************
01972      * storage operations: smfile.cpp
01973      *****************************************************************/
01974     /**\addtogroup SSMSTORE 
01975      * Indexes and files are special cases of "stores".
01976      * A store is a linked list of extents, and an extent is a
01977      * contiguous group of pages.  So the store is the structure
01978      * that holds together an ordered set of pages that can be
01979      * used by a server and have an identifier (a store ID or stid_t).
01980      *
01981      * Indexes and files of records are built on stores.
01982      *
01983      * Stores have logging properties and 
01984      * other metadata associated with them.
01985      * 
01986      * The property that determines the logging level of the store is
01987      * \ref sm_store_property_t.
01988      *
01989      * Methods that let you get and change the metadata are:
01990      * - ss_m::get_store_property
01991      * - ss_m::set_store_property
01992      * - ss_m::get_store_info
01993      * - \ref snum_t
01994      *
01995      * When a transaction deletes a file or index, the deletion of the
01996      * underlying stores is delayed until the transaction commits so that
01997      * the pages allocated to the stores remain reserved (lest the
01998      * transaction aborts). The deleting transaction could, in theory,
01999      * reuse the pages for another store, but in practice that is not done.
02000      * Instead, when a store is deleted, the store is marked
02001      * for deletion and put in a list for the transaction to delete upon
02002      * commit.   At commit time, stores that have property t_load_file
02003      * or t_insert_file are converted to t_regular.
02004      */
02005 
02006     /**\brief Change the store property of a file or index.
02007      * \ingroup SSMSTORE
02008      * @param[in] stid   File ID or index ID of the store to change.
02009      * @param[in] property   Enumeration store_property_t (alias for
02010      *                   smlevel_3::sm_store_property_t, q.v.)
02011      *
02012      * \details
02013      * The possible uses of store properties are described with 
02014      * smlevel_3::sm_store_property_t.
02015      */
02016     static rc_t            set_store_property(
02017         stid_t                stid,
02018         store_property_t      property
02019         );
02020 
02021     /**\brief Get the store property of a file or index.
02022      * \ingroup SSMSTORE
02023      * @param[in] stid   File ID or index ID of the store of interest.
02024      * @param[out] property   Reference to enumeration store_property_t
02025      *                  (alias for smlevel_3::sm_store_property_t, q.v.)
02026      *
02027      * \details
02028      * The possible uses of store properties are described with
02029      * smlevel_3::sm_store_property_t.
02030      */
02031     static rc_t            get_store_property(
02032         stid_t                stid,
02033         store_property_t&     property);
02034 
02035     /**\brief Get various store information of a file or index.
02036      * \ingroup SSMSTORE
02037      * @param[in] stid   File ID or index ID of the store of interest.
02038      * @param[out] info  Reference to sm_store_info_t into which to
02039      * write the results.
02040      *
02041      * \details
02042      * Get internally stored information about a store.
02043      */
02044     static rc_t            get_store_info( 
02045         const stid_t&         stid, 
02046         sm_store_info_t&      info);
02047 
02048     //
02049     // Functions for B+tree Indexes
02050     //
02051     /**\addtogroup SSMBTREE 
02052      * The storage manager supports B+-Tree indexes, which provide associative access 
02053      * to data by associating keys with values in 1:1 or many:1 relationships.
02054      * Keys may be composed of any of the basic C-language types (integer,
02055      * unsigned, floating-point of several sizes) or
02056      * variable-length character strings (wide characters are \b not supported).
02057      *
02058      * The number of key-value pairs that an index can hold is limited by the
02059      * space available on the volume containing the index.
02060      * \anchor max_entry_size 
02061      * The combined sizes of the key and value must
02062      * be less than or equal to \ref max_entry_size, which is
02063      * a function of the page size, and is 
02064      * such that two entries of this size fit on a page along with all
02065      * the page and entry metadata.  See sm_config_info_t and ss_m::config_info.
02066      *
02067      * The minimum size of a B-Tree index is 8 pages (1 extent).
02068      *
02069      * A variety of locking protocols is supported:
02070      * - none : acquire no locks on the {key,value} pairs in the index,
02071      *   although an intention lock might be acquired on the index.
02072      * - kvl : key-value locking. See \ref MOH1.  The key or
02073      *   key-value pair is hashed into a 4-byte value and used with the
02074      *   given store id to make a lock id.
02075      * - im : index-management locking. See \ref MOH1.  
02076      *   The "value" portion of
02077      *   the key-value lock is taken to be a record id, which is used 
02078      *   for the lock id.
02079      * - modified kvl : an ad-hoc protocol used by the Paradise project. See \ref MODKVL "the scan_index_i constructor". As with index-management locking, 
02080      *   the "value" portion of
02081      *   the key-value lock is taken to be a record id, which is used 
02082      *   for the lock id.
02083      * - file : full-index locking.
02084      *
02085      * \section key_description Key Types
02086      * A B+-Tree index key has a type determined when the index is created.
02087      * All keys are stored in lexicographic format based on an interpretation of
02088      * the key determined by the key description given when the index is
02089      * created.
02090      * Lookups on the B+-Tree then involve a single byte-by-byte
02091      * comparison of two byte-strings, each composed of its concatenated
02092      * sub-keys.
02093      *
02094      * The key description is a null-terminated string as follows:
02095      \verbatim
02096      <key_description>    ::=  <fixed_len_part>*  <variable_len_part>  |
02097                                <fixed_len_part>+ 
02098      <fixed_len_part>     ::=  <type> <len> 
02099      <variable_len_part>  ::=  <type> '*' <len>
02100      <type>               ::=  'i' | 'u' | 'f' | 'b' | 'I' | 'U' | 'F' | 'B'
02101      <len>                ::=   [1-9][0-9]*
02102      \endverbatim
02103      * Thus, a key may have any number of fixed-length parts followed by at
02104      * most one variable-length part.
02105      *
02106      * The fixed-length parts (if present) consist of a type and a length.
02107      *
02108      * The variable-length part (if present) consists of a type and a length
02109      * separated by an asterisk, which is what distinguishes a variable-length
02110      * from a fixed-length part.
02111      *
02112      * Types and permissible lengths are:
02113      * - integer (1,2,4,8)
02114      * - unsigned (1,2,4,8)
02115      * - floating (4,8)
02116      * - uninterpreted byte (any length greater than zero)
02117      *
02118      * A capital letter indicates that the key part may be compressed. Only prefix
02119      * compression is implemented, so it makes sense to compress if the
02120      * first part of the key is compressible.
02121      *
02122      * Examples:
02123      * - "B40u4u2u2" : 40-byte character string followed by a 4-byte unsigned
02124      *                 integer and two 2-byte unsigned integers, such as one
02125      *                 might use for name.year.mo.day.  The character string is
02126      *                 prefix-compressed.
02127      * - "f8"        : an 8-byte floating-point number (double)
02128      * - "I8B*1000"  : An 8-byte integer followed by an uninterpreted string
02129      *                 of up to 1000 bytes, all prefix-compressed.
02130      *
02131      * \note Wide characters are not supported.
02132      *
02133      * This key descriptor is stored in the sm_store_info_t, which is
02134      * stored on the volume and is available with the method ss_m::get_store_info.
02135      * Keys are stored in \ref LEXICOFORMAT "lexicographic format". The
02136      * storage manager knows how to convert all the key types listed above.
02137      * When duplicates are permitted, the index assumes that the elements
02138      * are in lexicographic order when searching for a <key,element> pair.
02139      *
02140      * \section XXXX1 Bulk Loading 
02141      * Bulk-loading of all index types is supported. See \ref SSMBULKLD.
02142      */
02143 
02144 
02145     /**\brief Create a B+-Tree index.
02146      * \ingroup SSMBTREE
02147      * @param[in] vid   Volume on which to create the index.
02148      * @param[in] ntype   Type of index. Legitimate values are: 
02149      *  - t_btree : B+-Tree with duplicate keys allowed
02150      *  - t_uni_btree : B+-Tree without duplicate keys 
02151      * @param[in] property Logging level of store. Legitimate values are:
02152      *  - t_regular
02153      *  - t_load_file
02154      *  - t_insert_file
02155      *  See sm_store_property_t for details.
02156      * @param[in] key_desc Description of key type.
02157      *  See \ref key_description for details.
02158      * @param[in] cc The locking protocol to use with this index. See
02159      * smlevel_0::concurrency_t and \ref SSMBTREE.
02160      * @param[out] stid New store ID will be returned here.
02161      */
02162     static rc_t            create_index(
02163                 vid_t                 vid, 
02164                 ndx_t                 ntype, 
02165                 store_property_t      property,
02166                 const char*           key_desc,
02167                 concurrency_t         cc, 
02168                 stid_t&               stid
02169     );
02170 
02171     /**\brief Create a B+-Tree or R*-Tree index.
02172      * \ingroup SSMBTREE
02173      *\attention For backward compatibility. Will be deprecated later.
02174      */
02175     static rc_t            create_index(
02176                 vid_t                 vid, 
02177                 ndx_t                 ntype, 
02178                 store_property_t      property,
02179                 const char*           key_desc,
02180                 stid_t&               stid
02181     );
02182 
02183     /**\brief Destroy a B+-Tree index.
02184      * \ingroup SSMBTREE
02185      *
02186      * @param[in] iid  ID of the index to be destroyed.
02187      */
02188     static rc_t            destroy_index(const stid_t& iid); 
02189 
02190     /**\brief Bulk-load a B+-Tree index from multiple data sources.
02191      * \ingroup SSMBULKLD
02192      *
02193      * @param[in] stid  ID of the index to be loaded.
02194      * @param[in] nsrcs  Number of files used for data sources.
02195      * @param[in] source  Array of IDs of files used for data sources.
02196      * @param[out] stats  Statistics concerning the load activity will be
02197      *                     written here.
02198      * @param[in] sort_duplicates  If "true" the bulk-load will sort
02199      * duplicates by value.
02200      * @param[in] lexify_keys  If "true" the keys are assumed not to
02201      * be in 
02202      * lexicographic format, and the bulk-load will reformat the key before
02203      * storing it in the index,
02204      * otherwise they are assumed already to be in lexicographic format.
02205      *
02206      * \anchor LEXICOFORMAT 
02207      * \b Lexicographic \b format
02208      * is the translation of numbers 
02209      * (int, float, double, unsigned, etc) into byte strings
02210      * such that a lexicographic comparison of the byte strings
02211      * yields the same result as the numeric comparison of the
02212      * original data.
02213      *
02214      * \note The data must already have been sorted by 
02215      * key in lexicographic format, but the keys themselves don't have
02216      * to be in lexicographic format; if the keys are not already in
02217      * lexicographic format, the \a lexify_keys argument must be given the value "true".
02218      *
02219      * In the case of duplicate keys, the bulk-load will handle the
02220      * sorting of the elements if \a sort_duplicates is "true"; this
02221      * sort will be done by a lexicographic comparison of the 
02222      * byte strings that compose the elements.
02223      */
02224     static rc_t            bulkld_index(
02225         const stid_t&             stid, 
02226         int                       nsrcs,
02227         const stid_t*             source,
02228         sm_du_stats_t&            stats,
02229         bool                      sort_duplicates = true,
02230         bool                      lexify_keys = true
02231     );
02232     /**\brief Bulk-load a B+-Tree index from a single data source.
02233      * \ingroup SSMBULKLD
02234      *
02235      * @param[in] stid  ID of the index to be loaded.
02236      * @param[in] source  ID of the file used as the data source.
02237      * @param[out] stats  Statistics concerning the load activity will be
02238      *                     written here.
02239      * @param[in] sort_duplicates  If "true" the bulk-load will sort
02240      * duplicates by value.
02241      * @param[in] lexify_keys  If "true" the keys are assumed not to
02242      * be in 
02243      * lexicographic format, and the bulk-load will reformat the key before
02244      * storing it in the index,
02245      * otherwise they are assumed already to be in lexicographic format.
02246      */
02247     static rc_t            bulkld_index(
02248         const stid_t&             stid, 
02249         const stid_t&             source,
02250         sm_du_stats_t&            stats,
02251         bool                      sort_duplicates = true,
02252         bool                      lexify_keys = true
02253     );
02254     /**\brief Bulk-load a B+-Tree index from a single data stream.
02255      * \ingroup SSMBULKLD
02256      *
02257      * @param[in] stid  ID of the index to be loaded.
02258      * @param[in] sorted_stream  Iterator that serves as the data source.
02259      * @param[out] stats  Statistics concerning the load activity will be
02260      *                     written here.
02261      *
02262      * See sort_stream_i.
02263      */
02264     static rc_t            bulkld_index(
02265         const stid_t&             stid, 
02266         sort_stream_i&            sorted_stream,
02267         sm_du_stats_t&            stats);
02268 
02269     /**\cond skip */
02270     static rc_t            print_index(stid_t stid);
02271     /**\endcond skip */
02272 
02273     /**\brief Create an entry in a B+-Tree index.
02274      * \ingroup SSMBTREE
02275      *
02276      * @param[in] stid  ID of the index. 
02277      * @param[in] key  Key for the association to be created.
02278      * @param[in] el  Element for the association to be created.
02279      *
02280      * The combined sizes of the key and element vectors must
02281      * be less than or equal to \ref max_entry_size.
02282      */
02283     static rc_t            create_assoc(
02284         stid_t                   stid, 
02285         const vec_t&             key, 
02286         const vec_t&             el
02287 #ifdef SM_DORA
02288         , const bool             bIgnoreLocks = false
02289 #endif
02290     );
02291     /**\brief Remove an entry from a B+-Tree index.
02292      * \ingroup SSMBTREE
02293      *
02294      * @param[in] stid  ID of the index. 
02295      * @param[in] key   Key of the entry to be removed.
02296      * @param[in] el   Element (value) of the entry to be removed.
02297      */
02298     static rc_t            destroy_assoc(
02299         stid_t                   stid, 
02300         const vec_t&             key,
02301         const vec_t&             el
02302 #ifdef SM_DORA
02303         , const bool             bIgnoreLocks = false
02304 #endif
02305     );
02306     /**\brief Destroy all entries associated with a key in a B+-Tree index. 
02307      * \ingroup SSMBTREE
02308      *
02309      * @param[in] stid  ID of the index. 
02310      * @param[in] key   Key of the entries to be removed.
02311      * @param[out] num_removed   The number of entries removed is returned here.
02312      */
02313     static rc_t            destroy_all_assoc(
02314         stid_t                  stid, 
02315         const vec_t&            key,
02316         int&                    num_removed
02317     );
02318     /**\brief Find an entry associated with a key in a B+-Tree index. 
02319      * \ingroup SSMBTREE
02320      *
02321      * @param[in] stid  ID of the index. 
02322      * @param[in] key   Key of the entry to be found.
02323      * @param[out] el   Element associated with the given key will be copied into this buffer.
02324      * @param[in,out] elen Length of buffer into which the 
02325      *                  result will be written. If too small, eRECWONTFIT will
02326      *                  be returned.
02327      *                 Length of result will be returned here.
02328      * @param[out] found   True if an entry is found.
02329      *
02330      * If the index is not unique (allows duplicates), the first
02331      * element found with the given key will be returned.
02332      *
02333      * To locate all entries associated with a non-unique key, you must
02334      * use scan_index_i, q.v.
02335      */
02336     static rc_t            find_assoc(
02337         stid_t                  stid, 
02338         const vec_t&            key, 
02339         void*                   el, 
02340         smsize_t&               elen, 
02341         bool&                   found
02342 #ifdef SM_DORA
02343         , const bool             bIgnoreLocks = false
02344 #endif
02345     );
02346 
02347     //
02348     // Functions for R*tree (multi-dimensional(MD), spatial) Indexes
02349     //
02350 
02351     /**\addtogroup SSMRTREE 
02352      *
02353      * An R-tree is a height-balanced structure designed for indexing
02354      * multi-dimensional spatial objects.  
02355      * It stores the minimal bounding box (with 2 or higher dimension) of 
02356      * a spatial object as the key in the leaf pages.
02357      * This implementation is a variant of an R-Tree called an R*-Tree, which
02358      * improves the search performance by using a heuristic for redistributing
02359      * entries and dynamically reorganizing the tree during insertion.
02360      *
02361      * An R*-Tree stores key,value pairs where the key is of type nbox_t
02362      * and the value is of type vec_t.
02363      *
02364      * The number of key-value pairs an index can hold is limited by the space
02365      * available on the volume containing the index.
02366      * The minimum size of an R*-tree index is 8 pages.
02367      *
02368      * 
02369      * \note This implementation 
02370      * uses coarse-grained (index-level) locking and 
02371      * supports only 2 dimensions and integer coordinates.
02372      * For information about R*-trees, see the \ref BKSS.
02373      *
02374      * Example:
02375      * \code
02376      scan_rt_i scan(idx, nbox_t::t_overlap, universe, true);
02377      bool      eof;
02378      nbox_t    k;
02379      char*     e;
02380      smsize_t  elen;
02381 
02382      int i=0; 
02383      while(!(rc = scan.next(k,e,elen,eof)).is_error() && !eof)
02384              i++;
02385      cout << "Rtree " << idx << " contains " << i << " entries." << endl;
02386      \endcode
02387      * 
02388      *
02389      * \section XXXX2 Bulk Loading 
02390      * Bulk-loading of all index types is supported. See \ref SSMBULKLD.
02391      */
02392      /*\example rtree_example.cpp*/
02393 
02394 
02395     /**\brief Create an R*-Tree (multi-dimensional spatial) index.
02396      * \ingroup SSMRTREE
02397      * @param[in] vid   Volume on which to create the index.
02398      * @param[in] ntype   Type of index. Legitimate values are: 
02399      *  - t_rtree : R*-Tree 
02400      * @param[in] property Logging level of store. Legitimate values are:
02401      *  - t_temporary
02402      *  - t_regular
02403      *  - t_load_file
02404      *  - t_insert_file
02405      *  See sm_store_property_t for details.
02406      * @param[in] dim Number of dimensions of the key (defaults to 2).
02407      * The key type is an nbox_t.
02408      * See \ref nbox_t for details. 
02409      * @param[out] stid New store ID will be returned here.
02410      */
02411     static rc_t            create_md_index(
02412         vid_t                   vid, 
02413         ndx_t                   ntype, 
02414         store_property_t        property,
02415         stid_t&                 stid, 
02416         int2_t                  dim = 2
02417     );
02418 
02419     /**\brief Destroy an R*-Tree index.
02420      * \ingroup SSMRTREE
02421      *
02422      * @param[in] iid  ID of the index to be destroyed.
02423      */
02424     static rc_t            destroy_md_index(const stid_t& iid);
02425 
02426     /**\brief Bulk-load a multi-dimensional index from multiple sources.
02427      * \ingroup SSMBULKLD
02428      * @param[in] stid  ID of the index to be loaded.
02429      * @param[in] nsrcs  Number of files used for data sources.
02430      * @param[in] source  Array of IDs of files used for data sources.
02431      * @param[out] stats  Statistics concerning the load activity will be
02432      *                     written here.
02433      * @param[in] hff   Heuristic fill factor. Not used.
02434      * @param[in] hef   Heuristic expansion factor. Not used.
02435      * @param[in] universe  Universal bounding box of all spatial objects indexed.
02436     */
02437     static rc_t            bulkld_md_index(
02438         const stid_t&             stid, 
02439         int                       nsrcs,
02440         const stid_t*             source, 
02441         sm_du_stats_t&            stats,
02442         int2_t                    hff=75,
02443         int2_t                    hef=120,
02444         nbox_t*                   universe=NULL);
02445 
02446     /**\brief Bulk-load a multi-dimensional index from a single source.
02447      * \ingroup SSMBULKLD
02448      * @param[in] stid  ID of the index to be loaded.
02449      * @param[in] source  ID of file to be used for data source.
02450      * @param[out] stats  Statistics concerning the load activity will be
02451      *                     written here.
02452      * @param[in] hff   Heuristic fill factor. Not used.
02453      * @param[in] hef   Heuristic expansion factor. Not used.
02454      * @param[in] universe  Universal bounding box of all spatial objects indexed.
02455     */
02456     static rc_t            bulkld_md_index(
02457         const stid_t&             stid, 
02458         const stid_t&             source, 
02459         sm_du_stats_t&            stats,
02460         int2_t                    hff=75,
02461         int2_t                    hef=120,
02462         nbox_t*                   universe=NULL);
02463 
02464     /**\brief Bulk-load a multi-dimensional index from a sorted stream source.
02465      * \ingroup SSMBULKLD
02466      * @param[in] stid  ID of the index to be loaded.
02467      * @param[in] sorted_stream  Input stream that is data source.
02468      * @param[out] stats  Statistics concerning the load activity will be
02469      *                     written here.
02470      * @param[in] hff   Heuristic fill factor. Not used.
02471      * @param[in] hef   Heuristic expansion factor. Not used.
02472      * @param[in] universe  Universal bounding box of all spatial objects indexed.
02473     */
02474     static rc_t            bulkld_md_index(
02475         const stid_t&             stid, 
02476         sort_stream_i&            sorted_stream,
02477         sm_du_stats_t&            stats,
02478         int2_t                    hff=75,
02479         int2_t                    hef=120,
02480         nbox_t*                   universe=NULL);
02481 
02482     static rc_t            print_md_index(stid_t stid);
02483 
02484     /**\brief Look up an entry in a multi-dimensional index.
02485      * \ingroup SSMRTREE
02486      *
02487      * @param[in] stid  ID of the index. 
02488      * @param[in] key   Key associated with the entry to look up.
02489      * @param[out] el   Element associated with the given key will be copied into this buffer.
02490      * @param[in,out] elen Length of buffer into which the 
02491      *                  result will be written. If too small, eRECWONTFIT will
02492      *                  be returned.
02493      *                 Length of result will be returned here.
02494      * @param[out] found   True if an entry is found.
02495      */
02496     static rc_t            find_md_assoc(
02497         stid_t                    stid, 
02498         const nbox_t&             key, 
02499         void*                     el, 
02500         smsize_t&                 elen, 
02501         bool&                     found);
02502 
02503     /**\brief Create an entry in a multi-dimensional index.
02504      * \ingroup SSMRTREE
02505      *
02506      * @param[in] stid  ID of the index. 
02507      * @param[in] key  Key for the association to be created.
02508      * @param[in] el  Element for the association to be created.
02509     */
02510     static rc_t            create_md_assoc(
02511         stid_t                    stid, 
02512         const nbox_t&             key,
02513         const vec_t&              el);
02514 
02515     /**\brief Destroy an entry in a multi-dimensional index.
02516      * \ingroup SSMRTREE
02517      *
02518      * @param[in] stid  ID of the index. 
02519      * @param[in] key   Key of the entry to be removed.
02520      * @param[in] el   Element (value) of the entry to be removed.
02521     */
02522     static rc_t            destroy_md_assoc(
02523         stid_t                    stid, 
02524         const nbox_t&             key,
02525         const vec_t&              el);
02526 
02527     /**\cond skip */
02528     // for debugging
02529     static rc_t            draw_rtree(const stid_t& stid, ostream &);
02530     /**\endcond skip */
02531 
02532     /**\brief Gather usage statistics about an R*-Tree index.
02533      * \ingroup SSMRTREE
02534      * @param[in] stid  ID of the index. 
02535      * @param[out] stat  Usage statistics will be written here.
02536      * @param[in] size  Number of uint2_t's in the array ovp.
02537      * @param[out] ovp   Pre-allocated array of integers into which
02538      * the method will write the overlap percentages for each level of the
02539      * tree.
02540      * @param[in] audit If "true", the method 
02541      * will check assertions about the
02542      * correctness of the rtree.
02543      * If the audit fails an internal fatal error is generated 
02544      * to facilitate debugging. (It will generate a core file if your
02545      * shell permits such.)
02546      *
02547      * \note for debugging
02548     */
02549     static rc_t            rtree_stats(
02550         const stid_t&             stid,
02551         rtree_stats_t&            stat,
02552         uint2_t                   size = 0,
02553         uint2_t*                  ovp = NULL,
02554         bool                      audit = false);
02555 
02556     /**\addtogroup SSMFILE 
02557      * You can create, destroy, and scan files of records. You may exert some
02558      * control over the order in which records appear in the file (a physical
02559      * scan), but, in general, the storage manager decides where to put records.
02560      *
02561      * Pages in a file are slotted pages: Each page contains an array of
02562      * slots.
02563      * Records take one of three forms: small, large, and very large.
02564      * - Small records fit in the slots on the file pages.
02565      * - Large records are too big to fit on a slotted page, so they are put
02566      * elsewhere, and the slots point to these records.  Actually, what is
02567      * in a slot is a small array of page pointers to the data of the large record.
02568      * - A very large record is one whose slot in the file page contains
02569      *   a single reference to a page that is an index of data pages.
02570      *
02571      * Because records may take these forms, the API for creating records
02572      * contains the opportunity for you to provide a hint about the ultimate
02573      * size of the record so that the storage manager can create the proper
02574      * structure for the record immediately, rather than creating a small
02575      * record that is soon to be converted to a large, then a very large record
02576      * by subsequent appends. 
02577      *
02578      * All records contain a client-defined header.  This is for the convenience
02579      * of server-writers.  The header must fit on the slotted page, so it should
02580      * never be very large.
02581      *
02582      * The following methods manipulate files of records and the records found 
02583      * there.
02584      *
02585      * Modules below describe file traversal and
02586      * appending to files (\ref SSMSCANF), 
02587      * and pinning individual records in the buffer pool for extended operations 
02588      * (\ref SSMPIN).
02589      *
02590      * \section UNINIT Uninitialized Data
02591      * The functions create_rec, append_rec, and update_rec can be used to
02592      * write blocks of data that are all zeroes,  with minimal logging. 
02593      * This is useful for creating records of known size but with uninitialized data.  
02594      * The type zvec_t, a special case of vec_t, is for this purpose. 
02595      * Construct it with only a size, as follows:
02596      * \code
02597      * zvec_t zdata(100000);
02598      * \endcode
02599      * The underlying logging code recognizes that this is a vector of zeroes and
02600      * logs only a count, not the data themselves. 
02601      *
02602      * \section Errors
02603      * If an error occurs in the middle of one of these methods that is updating persistent data,
02604      * the record or file \e could be in an inconsistent state. 
02605      * The caller has the choice of aborting the transaction or rolling back to the nearest savepoint (see \ref SSMXCT).
02606      *
02607      * \sa SSMSCAN, SSMPIN, vec_t, zvec_t, IDs.
02608      */
02609     
02610     /**\brief Create a file of records.
02611      * \ingroup SSMFILE
02612      * \details
02613      * @param[in] vid   Volume on which to create a file.
02614      * @param[out] fid  Returns (store) ID of the new file here.
02615      * @param[in] property Give the file this property.
02616      * @param[in] cluster_hint Not used. 
02617      *
02618      * The cluster hint is included in the API for future use. 
02619      * It has no effect.
02620      */
02621     static rc_t            create_file( 
02622         vid_t                   vid, 
02623         stid_t&                 fid,
02624         store_property_t        property,
02625         shpid_t                 cluster_hint = 0
02626     ); 
02627 
02628     /**\brief Destroy a file of records.
02629      * \ingroup SSMFILE
02630      * \details
02631      * @param[in] fid  ID of the file to destroy.
02632      */
02633     static rc_t            destroy_file(const stid_t& fid); 
02634 
02635     /**\brief Create a new record.
02636      * \ingroup SSMFILE
02637      * \details
02638      * @param[in] fid  ID of the file in which to create a record.
02639      * @param[in] hdr  What to put in the record's header.
02640      * @param[in] len_hint  Hint about how big the record will ultimately be.
02641      * This is used to determine the initial format of the record. If you plan
02642      * to append to the record and know that it will ultimately become a large
02643      * record, it is more efficient to give a size hint that is larger than
02644      * a page here. Otherwise, the record will be made small (as determined by
02645      * the size of the parameter \a data ), and subsequent appends will cause 
02646      * the record to be converted to a large record.
02647      * @param[in] data  What to put in the record's body. 
02648      * @param[out] new_rid  ID of the newly created record.
02649      */
02650     static rc_t            create_rec(
02651         const stid_t&            fid, 
02652         const vec_t&             hdr, 
02653         smsize_t                 len_hint, 
02654         const vec_t&             data, 
02655         rid_t&                   new_rid
02656 #ifdef SM_DORA
02657         , const bool             bIgnoreLocks = false
02658 #endif
02659     ); 
02660 
02661     /**\brief Destroy a record.
02662      * \ingroup SSMFILE
02663      * \details
02664      * @param[in] rid  ID of the record to destroy.
02665      */
02666     static rc_t            destroy_rec(const rid_t& rid
02667 #ifdef SM_DORA
02668         , const bool             bIgnoreLocks = false
02669 #endif
02670                                        );
02671 
02672     /**\brief Modify the body of an existing record.
02673      * \ingroup SSMFILE
02674      * \details
02675      * @param[in] rid  ID of the record to modify.
02676      * @param[in] start  First byte to change.
02677      * @param[in] data  What to put in the record's body.  
02678      *
02679      * This overwrites
02680      * the existing bytes, starting at the offset \a start through the
02681      * byte at \a start + \a data.size() - 1.
02682      * This method \b cannot \b be \b used to change the size of a record.
02683      * Attempting this will result in an error.
02684      */
02685     static rc_t            update_rec(
02686         const rid_t&             rid, 
02687         smsize_t                 start, 
02688         const vec_t&             data);
02689 
02690     /**\brief Modify the header of an existing record.
02691      * \ingroup SSMFILE
02692      * \details
02693      * @param[in] rid  ID of the record to modify.
02694      * @param[in] start  First byte to change.
02695      * @param[in] hdr  What to put in the record's header.  
02696      *
02697      * This overwrites
02698      * the existing bytes, starting at the offset \a start through the
02699      * byte at \a start + \a hdr.size().
02700      * This method \b cannot \b be \b used to change the size of a record
02701      * header. There are no methods for appending to or truncating a
02702      * record header.
02703      *
02704      * \sa pin_i::update_rec, \ref SSMPIN
02705      */
02706     static rc_t            update_rec_hdr(
02707         const rid_t&             rid, 
02708         smsize_t                 start, 
02709         const vec_t&             hdr);
02710     // see also pin_i::update_rec*()
02711 
02712     /**\brief Append bytes to a record body.
02713      * \ingroup SSMFILE
02714      * \details
02715      * @param[in] rid  ID of the record to modify.
02716      * @param[in] data  What to append to the record.
02717      *
02718      * \note This appends \b to a record; it does \b not append a record to a file!
02719      * \sa pin_i::append_rec, \ref SSMPIN
02720      */
02721     static rc_t            append_rec(
02722         const rid_t&             rid, 
02723         const vec_t&             data
02724                 );
02725 
02726     /**\brief Chop bytes off the end of a record body.
02727      * \ingroup SSMFILE
02728      * \details
02729      * @param[in] rid  ID of the record to modify.
02730      * @param[in] amount  How many bytes to lop off.
02731      *
02732      * \sa pin_i::truncate_rec, \ref SSMPIN
02733      */
02734     static rc_t            truncate_rec(
02735         const rid_t&             rid, 
02736         smsize_t                 amount
02737     );
02738 
02739     /**\brief Chop bytes off the end of a record body.
02740      * \ingroup SSMFILE
02741      * \details
02742      * @param[in] rid  ID of the record to modify.
02743      * @param[in] amount  How many bytes to lop off.
02744      * @param[out] should_forward  Returns true if the record started out
02745      * large but is now small as a result of the truncation.  
02746      * This enables a value-added server to take action in this event,
02747      * should it so desire.
02748      *
02749      * \sa pin_i::truncate_rec, \ref SSMPIN
02750      */
02751     static rc_t            truncate_rec(
02752         const rid_t&             rid, 
02753         smsize_t                 amount,
02754         bool&                    should_forward 
02755     );
02756 
02757 #ifdef OLDSORT_COMPATIBILITY
02758     typedef ssm_sort::key_info_t key_info_t;
02759 
02760     /* old sort physical version */
02761     /**\brief Sort a file. Deprecated.
02762      * \details
02763      */
02764     static rc_t            sort_file(
02765         const stid_t&             fid, 
02766         vid_t                     vid, 
02767         stid_t&                   sfid, 
02768         store_property_t          property,
02769         const key_info_t&         key_info, 
02770         int                       run_size,
02771         bool                      ascending = true,
02772         bool                      unique = false,
02773         bool                      destructive = false,
02774         bool                      use_new_sort = true);
02775 
02776     /**\brief Sort a file. Deprecated.
02777      * \details
02778      */
02779     static rc_t            new_sort_file(
02780         const stid_t&             fid, 
02781         vid_t                     vid, 
02782         stid_t&                   sfid, 
02783         store_property_t          property,
02784         const key_info_t&         key_info, 
02785         int                       run_size,
02786         bool                      ascending = true,
02787         bool                      unique = false,
02788         bool                      destructive = false
02789         );
02790 #endif /* OLDSORT_COMPATIBILITY */
02791 
02792     typedef ssm_sort::sort_keys_t sort_keys_t;
02793 
02794     /* new sort physical version : see notes below */
02795     /**\brief Sort a file.
02796      * \ingroup SSMSORT
02797      * @param[in] fid File to sort.
02798      * @param[in] sorted_fid File to which to write the results. 
02799      * @param[in] nvids Size of array \a vid.
02800      * @param[in] vid Array of IDs of the volumes to use for scratch files.
02801      * @param[in] kl See sort_keys_t.
02802      * @param[in] min_rec_sz Hint of minimum record size in input file.
02803      * @param[in] run_size Number of pages in buffer pool to use for a run. 
02804      * @param[in] temp_space Number of pages to use for scratch space.
02805      * (This limits the amount of memory used by the sort).
02806      *
02807      * \details
02808      * Before you call sort_file, you must create an output file \a sorted_fid
02809      * into which sort_file will write the results.
02810      *
02811      * The sort uses temporary files when the input file contains more records
02812      * than can fit in one run (determined by \a run_size). These temporary files
02813      * may be spread across multiple volumes, which is useful if the
02814      * volumes reside on different spindles.  The arguments \a nvids
02815      * and \a vid are for indicating the volumes to use for these scratch
02816      * files.
02817      *
02818      * The caller can provide a clue in \a min_rec_sz
02819      * about the minimum record size of the
02820      * input file, which can help the sort's efficiency.
02821      *
02822      * The \a run_size indicates how many buffer-pool pages to use
02823      * for each run.
02824      * Since at all times one page is fixed for output, while the rest are 
02825      * for reading the input in runs, the real run size is \a run_size-1.
02826      * 
02827      */
02828     static rc_t            sort_file(
02829         const stid_t&            fid,     // input file
02830         const stid_t&            sorted_fid, // output file 
02831         int                      nvids,    // array size for vids
02832         const vid_t*             vid,     // array of vids for temp
02833                         // files
02834                         // created by caller--
02835                         // can be same as input file
02836         sort_keys_t&            kl, // kl &
02837         smsize_t                min_rec_sz, // for estimating space use
02838         int                     run_size,   // # pages to use for a run
02839         int                     temp_space // # pages VM to use for scratch 
02840     );
02841 
02842     /**\brief Return the short volume ID of a volume.
02843      * \ingroup SSMVOL
02844      *
02845      * @param[in] lvid Long (persistent) volume ID found on the volume's
02846      * header.
02847      * @param[out] vid Short volume ID of a mounted volume.
02848      */
02849     static rc_t            lvid_to_vid(
02850         const lvid_t&          lvid,
02851         vid_t&                 vid);
02852 
02853     /**\brief Return the long volume ID of a volume.
02854      * \ingroup SSMVOL
02855      *
02856      * @param[in] vid Short volume ID of a mounted volume.
02857      * @param[out] lvid Long (persistent) volume ID found on the volume's
02858      * header.
02859      */
02860     static rc_t            vid_to_lvid(
02861         vid_t                  vid,
02862         lvid_t&                lvid);
02863 
02864     /*****************************************************************
02865      * Locking related functions
02866      *
02867      * NOTE: there are standard conversions from lpid_t, rid_t, and
02868      *       stid_t to lockid_t, so wherever a lockid_t parameter is
02869      *         specified a lpid_t, rid_t, or stid_t can be used.
02870      *
02871      *****************************************************************/
02872 
02873 #if SLI_HOOKS
02874     /* enable/disable SLI globally for all threads created after this
02875        point. Does *NOT* disable SLI for existing threads.
02876      */
02877     static void            set_sli_enabled(bool enabled);
02878     static void            set_elr_enabled(bool enabled);
02879 
02880     static rc_t            set_log_features(char const* features);
02881     static char const*         get_log_features();
02882 #endif
02883 
02884     /**\brief Acquire a lock.
02885      * \ingroup SSMLOCK
02886      * @param[in]  n  Lock id of the entity to lock. There are
02887      * conversions from record ids, volume ids, store ids, and page ids to
02888      * lockid_t.
02889      * @param[in]  m  Desired lock mode.  Values: EX, SH.
02890      * @param[in]  d  Desired duration.  Values: 
02891      * - t_very_long : Held across transaction boundaries; 
02892      *             cannot be released by unlock()
02893      * - t_long : Released at commit; cannot be released by unlock()
02894      * - t_medium : May be released early by explicit unlock()
02895      * - t_short  : May be released early by explicit unlock()
02896      * - t_instant : Not held: acquired and released immediately.  Useful
02897      *             to see if any other transaction holds an incompatible lock.
02898      * @param[in]  timeout  Milliseconds willing to block.  See timeout_in_ms.
02899      *
02900      * The lock manager is written with these durations in mind, but the
02901      * only durations used by the storage manager are t_instant and t_long.
02902      * Medium-duration locks are used internally in one place.
02903      *
02904      * Durations other than long and instant are not well-tested.
02905      */
02906     static rc_t            lock(
02907         const lockid_t&         n, 
02908         lock_mode_t             m,
02909         lock_duration_t         d = t_long,
02910         timeout_in_ms           timeout = WAIT_SPECIFIED_BY_XCT
02911     );
02912     
02913     /**\brief Release a lock.
02914      * \ingroup SSMLOCK
02915      * @param[in]  n  Lock id of the entity to unlock. There are
02916      * conversions from record ids, volume ids, store ids, and page ids to
02917      * lockid_t.
02918      */
02919     static rc_t            unlock(const lockid_t& n);
02920 
02921     /**\brief  Disable lock escalation on the given entity. 
02922      * \ingroup SSMLOCK
02923      * @param[in]  n  Lock id of the entity to lock. There are
02924      * conversions from record ids, volume ids, store ids, and page ids to
02925      * lockid_t.
02926      * @param[in]  passOnToDescendants If true, apply this to the descendants
02927      * of \a n.
02928      */
02929     static rc_t            dont_escalate(
02930         const lockid_t&           n,
02931         bool                      passOnToDescendants = true
02932     );
02933 
02934     /**\brief  Find the storage-manager-wide escalation thresholds
02935      * \ingroup SSMLOCK
02936      * Default values (used for all transactions until they change
02937      * their per-transaction thresholds) are determined by the
02938      * storage-manager-wide options.
02939      * See \ref SSMOPT.
02940      */
02941     static rc_t            get_escalation_thresholds(
02942         w_base_t::int4_t&        toPage,
02943         w_base_t::int4_t&        toStore,
02944         w_base_t::int4_t&        toVolume);
02945 
02946     /**\brief  Change the storage-manager-wide escalation thresholds
02947      * \ingroup SSMLOCK
02948      * Default values (used for all transactions until they change
02949      * their per-transaction thresholds) are determined by the
02950      * storage-manager-wide options.
02951      * See \ref SSMOPT.
02952      */
02953     static rc_t            set_escalation_thresholds(
02954         w_base_t::int4_t       toPage,
02955         w_base_t::int4_t       toStore,
02956         w_base_t::int4_t       toVolume);
02957 
02958     /**\brief  Find out if the attached transaction has an entity locked.
02959      * \ingroup SSMLOCK
02960      * @param[in]  n  Lock id of the entity to query. There are
02961      * conversions from record ids, volume ids, store ids, and page ids to
02962      * lockid_t.
02963      * @param[out]  m  Mode of lock held. NL if none.
02964      * @param[in]  implicit If "true" the query will return a lock mode if
02965      * an implicit lock is held, otherwise the lock must be held explicitly.
02966      */
02967     static rc_t            query_lock(
02968         const lockid_t&        n, 
02969         lock_mode_t&           m,
02970         bool                   implicit = false
02971     );
02972 
02973     /*****************************************************************
02974      * Lock Cache related functions
02975      *
02976      * Each transaction has a cache of recently acquired locks
02977      * The following functions control the use of the cache.
02978      * Note that the functions affect the transaction currently
02979      * associated with the thread.
02980      *****************************************************************/
02981     // turn on(enable=true) or  off/(enable=false) the lock cache 
02982     // return previous state.
02983     /**\brief Control  lock caching for attached transaction.
02984      * \ingroup SSMLOCK
02985      *
02986      * @param[in] enable Set to true if you want to turn on lock caching
02987      * for the attached transaction.  The default is that it is turned on.
02988      *
02989      * Only long-duration locks are cached.
02990      * Lock caching can be turned off by default using the 
02991      * sm_lock_caching option.  Even with it turned off by default, it
02992      * can be turned on for a given transaction with this method.
02993      *
02994      */
02995     static rc_t            set_lock_cache_enable(bool enable);
02996 
02997     /**\brief True if lock cache is enabled for the attached transaction 
02998      * \ingroup SSMLOCK
02999      *
03000      * @param[out] enabled Will be set to true if the attached transaction has
03001      * lock caching enabled, false otherwise.
03002      */
03003     static rc_t            lock_cache_enabled(bool& enabled);
03004 
03005 private:
03006 
03007     static int _instance_cnt;
03008     static option_group_t* _options;
03009     static option_t* _hugetlbfs_path;
03010     static option_t* _reformat_log;
03011     static option_t* _prefetch;
03012     static option_t* _bufpoolsize;
03013     static option_t* _locktablesize;
03014     static option_t* _logdir;
03015     static option_t* _logsize;
03016     static option_t* _logbufsize;
03017     static option_t* _error_log;
03018     static option_t* _error_loglevel;
03019     static option_t* _lockEscalateToPageThreshold;
03020     static option_t* _lockEscalateToStoreThreshold;
03021     static option_t* _lockEscalateToVolumeThreshold;
03022     static option_t* _cc_alg_option;
03023     static option_t* _log_warn_percent;
03024     static option_t* _num_page_writers;
03025     static option_t* _logging;
03026     static option_t* _lock_caching_default;
03027 
03028 
03029     static rc_t            _set_option_logsize(
03030         option_t*              opt,
03031         const char*            value,
03032         ostream*               err_stream);
03033     
03034     static rc_t            _set_option_lock_escalate_to_page(
03035         option_t*              opt,
03036         const char*            value,
03037         ostream*               err_stream);
03038     
03039     static rc_t            _set_option_lock_escalate_to_store(
03040         option_t*              opt,
03041         const char*            value,
03042         ostream*               err_stream);
03043     
03044     static rc_t            _set_option_lock_escalate_to_volume(
03045         option_t*              opt,
03046         const char*            value,
03047         ostream*               err_stream);
03048     
03049     static rc_t            _set_store_property(
03050         stid_t                stid,
03051         store_property_t      property);
03052 
03053     static rc_t            _get_store_property(
03054         stid_t                stid,
03055         store_property_t&     property);
03056 
03057     static rc_t         _begin_xct(
03058         sm_stats_info_t*      stats,  // allocated by caller
03059         tid_t&                tid, 
03060         timeout_in_ms         timeout);
03061 
03062     static rc_t            _commit_xct(
03063         sm_stats_info_t*&     stats,
03064         bool                  lazy,
03065         lsn_t* plastlsn);
03066 
03067     static rc_t            _prepare_xct(
03068         sm_stats_info_t*&     stats,
03069         vote_t&                v);
03070 
03071     static rc_t            _set_coordinator(const server_handle_t &); 
03072     
03073     static rc_t            _enter_2pc(const gtid_t &); 
03074     static rc_t            _force_vote_readonly(); 
03075     static rc_t            _recover_2pc(const gtid_t &,// in
03076                                 bool    mayblock,
03077                                 tid_t    &    //out -- attached if found(?)
03078                             );
03079     static rc_t            _chain_xct(
03080         sm_stats_info_t*&      stats,
03081         bool                   lazy);
03082 
03083     static rc_t            _abort_xct(
03084         sm_stats_info_t*&      stats);
03085 
03086     static rc_t            _save_work(sm_save_point_t& sp);
03087 
03088     static rc_t            _rollback_work(const sm_save_point_t&        sp);
03089     static rc_t            _mount_dev(
03090         const char*            device,
03091         u_int&                 vol_cnt,
03092         vid_t                  local_vid);
03093 
03094     static rc_t            _dismount_dev(
03095         const char*            device,
03096         bool                   dismount_if_locked = true
03097     );
03098     static rc_t            _create_vol(
03099         const char*            device_name,
03100         const lvid_t&          lvid,
03101         smksize_t              quota_KB,
03102         bool                   skip_raw_init,
03103         const bool             apply_fake_io_latency,
03104         const int              fake_disk_latency);
03105 
03106     static rc_t            _create_index(
03107         vid_t                 vid, 
03108         ndx_t                 ntype, 
03109         store_property_t      property,
03110         const char*           key_desc,
03111         concurrency_t         cc,
03112         stid_t&               stid
03113     );
03114 
03115     static rc_t            _destroy_index(const stid_t& iid); 
03116 
03117     static rc_t            _get_store_info( 
03118         const stid_t  &       stid, 
03119         sm_store_info_t&      info);
03120 
03121     static rc_t            _bulkld_index(
03122         const stid_t&         stid,
03123         int                   nsrcs,
03124         const stid_t*         source,
03125         sm_du_stats_t&        stats,
03126         bool                  sort_duplicates = true,
03127         bool                  lexify_keys = true
03128     );
03129 
03130     static rc_t            _bulkld_index(
03131         const stid_t&          stid, 
03132         sort_stream_i&         sorted_stream,
03133         sm_du_stats_t&         stats
03134     );
03135 
03136     static rc_t            _print_index(const stid_t &iid);
03137 
03138     static rc_t            _create_assoc(
03139         const stid_t  &        stid, 
03140         const vec_t&           key, 
03141         const vec_t&           el
03142 #ifdef SM_DORA
03143         , const bool             bIgnoreLocks = false
03144 #endif
03145     );
03146 
03147     static rc_t            _destroy_assoc(
03148         const stid_t &        stid, 
03149         const vec_t&          key,
03150         const vec_t&          el
03151 #ifdef SM_DORA
03152         , const bool             bIgnoreLocks = false
03153 #endif
03154     );
03155 
03156     static rc_t            _destroy_all_assoc(
03157         const stid_t&        stid, 
03158         const vec_t&         key,
03159         int&                 num_removed
03160     );
03161     static rc_t            _find_assoc(
03162         const stid_t&        stid, 
03163         const vec_t&         key, 
03164         void*                el, 
03165         smsize_t&            elen, 
03166         bool&                found
03167 #ifdef SM_DORA
03168         , const bool             bIgnoreLocks = false
03169 #endif
03170     );
03171 
03172     // below method overloaded for rtree
03173     static rc_t            _create_md_index(
03174         vid_t                 vid, 
03175         ndx_t                 ntype, 
03176         store_property_t      property,
03177         stid_t&               stid, 
03178         int2_t                dim=2
03179     );
03180 
03181     static rc_t            _destroy_md_index(const stid_t& iid);
03182 
03183     static rc_t            _destroy_md_assoc(
03184         stid_t                stid,
03185         const nbox_t&         key,
03186         const vec_t&          el);
03187 
03188     static rc_t            _bulkld_md_index(
03189         const stid_t&         stid, 
03190         int                   nsrcs,
03191         const stid_t*         source, 
03192         sm_du_stats_t&        stats,
03193         int2_t                hff,           // for rtree only
03194         int2_t                hef,           // for rtree only
03195         nbox_t*               universe);// for rtree only
03196 
03197     static rc_t            _bulkld_md_index(
03198         const stid_t&         stid, 
03199         sort_stream_i&        sorted_stream,
03200         sm_du_stats_t&        stats,
03201         int2_t                hff,           // for rtree only
03202         int2_t                hef,           // for rtree only
03203         nbox_t*               universe);// for rtree only
03204 
03205     static rc_t            _print_md_index(stid_t stid);
03206 
03207     static rc_t            _create_md_assoc(
03208         stid_t                stid, 
03209         const nbox_t&         key,
03210         const vec_t&          el);
03211 
03212     static rc_t            _find_md_assoc(
03213         stid_t                stid, 
03214         const nbox_t&         key, 
03215         void*                 el, 
03216         smsize_t&             elen, 
03217         bool&                 found);
03218 
03219     //
03220     // The following functions deal with files of records.
03221     //
03222     static rc_t            _destroy_n_swap_file(
03223         const stid_t&         old_fid,
03224         const stid_t&         new_fid);
03225 
03226     static rc_t            _create_file(
03227         vid_t                 vid, 
03228         stid_t&               fid,
03229         store_property_t     property,
03230         shpid_t              cluster_hint = 0
03231     ); 
03232 
03233     static rc_t            _destroy_file(const stid_t& fid); 
03234 
03235     static rc_t            _create_rec(
03236         const stid_t&            fid, 
03237         const vec_t&             hdr, 
03238         smsize_t                 len_hint, 
03239         const vec_t&             data, 
03240         rid_t&                   new_rid
03241 #ifdef SM_DORA
03242         , const bool             bIgnoreLocks = false
03243 #endif
03244         ); 
03245 
03246     static rc_t            _destroy_rec(
03247         const rid_t&             rid
03248 #ifdef SM_DORA
03249         , const bool             bIgnoreLocks = false
03250 #endif
03251         );
03252 
03253     static rc_t            _update_rec(
03254         const rid_t&             rid, 
03255         smsize_t                 start, 
03256         const vec_t&             data
03257 #ifdef SM_DORA
03258         , const bool             bIgnoreLocks = false
03259 #endif
03260         );
03261 
03262     static rc_t            _update_rec_hdr(
03263         const rid_t&             rid, 
03264         smsize_t                 start, 
03265         const vec_t&             hdr
03266 #ifdef SM_DORA
03267         , const bool             bIgnoreLocks = false
03268 #endif
03269         );
03270 
03271     static rc_t            _append_rec(
03272         const rid_t&             rid, 
03273         const vec_t&             data
03274         );
03275 
03276     static rc_t            _truncate_rec(
03277             const rid_t&         rid, 
03278             smsize_t             amount,
03279             bool&                should_forward
03280         );
03281 
03282     static rc_t            _draw_rtree(const stid_t& stid, ostream &);
03283 
03284     static rc_t            _rtree_stats(
03285             const stid_t&       stid,
03286             rtree_stats_t&      stat,
03287             uint2_t             size,
03288             uint2_t*            ovp,
03289             bool                audit
03290         );
03291 
03292 #ifdef OLDSORT_COMPATIBILITY
03293     /* old sort internal, physical */
03294     static rc_t            _sort_file(
03295         const stid_t&           fid, 
03296         vid_t                   vid, 
03297         stid_t&                 sfid, 
03298         store_property_t        property,
03299         const key_info_t&       key_info, 
03300         int                     run_size,
03301         bool                    ascending,
03302         bool                    unique,
03303         bool                    destructive
03304     );
03305 #endif /* OLDSORT_COMPATIBILITY */
03306 
03307     /* new sort internal, physical */
03308     static rc_t            _sort_file(
03309         const stid_t&             fid,     // input file
03310         const stid_t&             sorted_fid, // output file -- 
03311                         // created by caller--
03312                         // can be same as input file
03313         int                      nvids,    // array size for vids
03314         const vid_t*             vid,     // array of vids for temp
03315         sort_keys_t&             kl,     // key location info &
03316         smsize_t                 min_rec_sz, // for estimating space use
03317         int                      run_size,   // # pages to use for a run
03318         int                      temp_space //# pages VM to use for scratch 
03319     );
03320 
03321 
03322 #ifdef OLDSORT_COMPATIBILITY
03323     /* internal compatibility old sort-> new sort */
03324     static rc_t            _new_sort_file(
03325             const stid_t&         in_fid, 
03326             const stid_t&         out_fid, 
03327             const key_info_t&    ki, 
03328             int                  run_size,
03329             bool                  ascending, 
03330             bool                  unique, 
03331             bool                  keep_orig //!destructive
03332             ); 
03333 #endif /* OLDSORT_COMPATIBILITY */
03334 
03335     static store_flag_t     _make_store_flag(store_property_t property);
03336     // reverse function:
03337     // static store_property_t    _make_store_property(w_base_t::uint4_t flag);
03338     // is in dir_vol_m
03339 
03340     // this is for df statistics  DU DF
03341     static rc_t            _get_du_statistics(
03342         vid_t                  vid, 
03343         sm_du_stats_t&         du,
03344         bool                   audit);
03345 
03346     static rc_t            _get_du_statistics(
03347         const stid_t  &        stid, 
03348         sm_du_stats_t&         du,
03349         bool                   audit);
03350 
03351     static rc_t            _get_volume_meta_stats(
03352         vid_t                  vid,
03353         SmVolumeMetaStats&     volume_stats,
03354         concurrency_t          cc);
03355 
03356     static rc_t            _get_file_meta_stats(
03357         vid_t                  vid,
03358         w_base_t::uint4_t      num_files,
03359         SmFileMetaStats*       file_stats,
03360         bool                   batch_calculate,
03361         concurrency_t          cc);
03362 };
03363 
03364 /**\brief Information about a store that can be queried by the client.
03365  * \details
03366  * This information is stored in a store directory on the volume.
03367  * It can be queried with ss_m::get_store_info.
03368  */
03369 class sm_store_info_t {
03370 public:
03371     NORET sm_store_info_t(int len) :
03372                 store(0), stype(ss_m::t_bad_store_t), 
03373                 ntype(ss_m::t_bad_ndx_t), cc(ss_m::t_cc_bad),
03374                 eff(0), large_store(0), root(0),
03375                 nkc(0), keydescrlen(len)
03376                 {  keydescr = new char[len]; }
03377 
03378     NORET ~sm_store_info_t() { if (keydescr) delete[] keydescr; }
03379 
03380     /// store number
03381     snum_t    store;        
03382     /// t_index, t_file, ... See ss_m::store_t.
03383     u_char    stype;        
03384     /// t_btree, t_rtree,... See ss_m::ndx_t
03385     u_char    ntype;        
03386     /// t_cc_kvl, t_cc_record,... See ss_m::concurrency_t
03387     u_char    cc;         
03388 
03389     /// Unused:
03390     u_char    eff;        
03391 
03392     /// Store number for associated large-page store, if there is one.
03393     snum_t    large_store; 
03394     /// Root page if this is an index.
03395     shpid_t    root;        
03396     /// Number of key components if this is an index.
03397     w_base_t::uint4_t    nkc;  
03398     /// Size of key description (if this is an index)
03399     int        keydescrlen;    
03400     /**\brief Variable length string.
03401      *
03402      * He who creates a sm_store_info_t for use with get_store_info()
03403      * is responsible for allocating enough space for 
03404      * key descriptors if he expects to find them.
03405      * See \ref key_description.
03406      */
03407     char        *keydescr;    
03408 };
03409 
03410 
03411 ostream& operator<<(ostream& o, const vid_t& v);
03412 istream& operator>>(istream& i, vid_t& v);
03413 ostream& operator<<(ostream& o, const extid_t& x);
03414 istream& operator>>(istream& o, extid_t &x);
03415 ostream& operator<<(ostream& o, const stid_t& stid);
03416 istream& operator>>(istream& i, stid_t& stid);
03417 ostream& operator<<(ostream& o, const lpid_t& pid);
03418 istream& operator>>(istream& i, lpid_t& pid);
03419 ostream& operator<<(ostream& o, const shrid_t& r);
03420 istream& operator>>(istream& i, shrid_t& r);
03421 ostream& operator<<(ostream& o, const rid_t& rid);
03422 istream& operator>>(istream& i, rid_t& rid);
03423 ostream& operator<<(ostream& o, const sm_stats_info_t& s);
03424 template<class ostream>
03425 ostream& operator<<(ostream& o, const sm_config_info_t& s)
03426 {
03427     o    << "  page_size " << s.page_size
03428      << "  max_small_rec " << s.max_small_rec
03429      << "  lg_rec_page_space " << s.lg_rec_page_space
03430      << "  buffer_pool_size " << s.buffer_pool_size
03431      << "  max_btree_entry_size " << s.max_btree_entry_size
03432      << "  exts_on_page " << s.exts_on_page
03433      << "  pages_per_ext " << s.pages_per_ext
03434      << "  multi_threaded_xct " << s.multi_threaded_xct
03435      << "  logging " << s.logging
03436       ;
03437     return o;
03438 }
03439 
03440 
03441 #ifndef VEC_T_H
03442 #include <vec_t.h>
03443 #endif
03444 
03445 #ifndef SM_ESCALATION_H
03446 #include <sm_escalation.h>
03447 #endif
03448 
03449 /*<std-footer incl-file-exclusion='SM_H'>  -- do not edit anything below this line -- */
03450 
03451 #endif          /*</std-footer>*/

Generated on Wed Jul 7 17:22:32 2010 for Shore Storage Manager by  doxygen 1.4.7