00001 /* -*- mode:C++; c-basic-offset:4 -*- 00002 Shore-MT -- Multi-threaded port of the SHORE storage manager 00003 00004 Copyright (c) 2007-2009 00005 Data Intensive Applications and Systems Labaratory (DIAS) 00006 Ecole Polytechnique Federale de Lausanne 00007 00008 All Rights Reserved. 00009 00010 Permission to use, copy, modify and distribute this software and 00011 its documentation is hereby granted, provided that both the 00012 copyright notice and this permission notice appear in all copies of 00013 the software, derivative works or modified versions, and any 00014 portions thereof, and that both notices appear in supporting 00015 documentation. 00016 00017 This code is distributed in the hope that it will be useful, but 00018 WITHOUT ANY WARRANTY; without even the implied warranty of 00019 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. THE AUTHORS 00020 DISCLAIM ANY LIABILITY OF ANY KIND FOR ANY DAMAGES WHATSOEVER 00021 RESULTING FROM THE USE OF THIS SOFTWARE. 00022 */ 00023 00024 /*<std-header orig-src='shore' incl-file-exclusion='SM_H'> 00025 00026 $Id: sm.h,v 1.314 2010/07/07 20:50:24 nhall Exp $ 00027 00028 SHORE -- Scalable Heterogeneous Object REpository 00029 00030 Copyright (c) 1994-99 Computer Sciences Department, University of 00031 Wisconsin -- Madison 00032 All Rights Reserved. 00033 00034 Permission to use, copy, modify and distribute this software and its 00035 documentation is hereby granted, provided that both the copyright 00036 notice and this permission notice appear in all copies of the 00037 software, derivative works or modified versions, and any portions 00038 thereof, and that both notices appear in supporting documentation. 00039 00040 THE AUTHORS AND THE COMPUTER SCIENCES DEPARTMENT OF THE UNIVERSITY 00041 OF WISCONSIN - MADISON ALLOW FREE USE OF THIS SOFTWARE IN ITS 00042 "AS IS" CONDITION, AND THEY DISCLAIM ANY LIABILITY OF ANY KIND 00043 FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 
00044 00045 This software was developed with support by the Advanced Research 00046 Project Agency, ARPA order number 018 (formerly 8230), monitored by 00047 the U.S. Army Research Laboratory under contract DAAB07-91-C-Q518. 00048 Further funding for this work was provided by DARPA through 00049 Rome Research Laboratory Contract No. F30602-97-2-0247. 00050 00051 */ 00052 00053 #ifndef SM_H 00054 #define SM_H 00055 00056 #include "w_defines.h" 00057 00058 /* -- do not edit anything above this line -- </std-header>*/ 00059 00060 /* 00061 * Stuff needed by value-added servers. NOT meant to be included by 00062 * internal SM .c files, except to the extent that they need these 00063 * definitions used in the API. 00064 */ 00065 00066 #ifdef __GNUG__ 00067 #pragma interface 00068 #endif 00069 00070 #ifndef SM_INT_4_H 00071 #include <sm_int_4.h> 00072 #endif 00073 00074 #ifndef SM_DU_STATS_H 00075 #include <sm_du_stats.h> // declares sm_du_stats_t 00076 #endif 00077 00078 #ifndef SM_STATS_H 00079 #include <smstats.h> // declares sm_stats_info_t and sm_config_info_t 00080 #endif 00081 00082 #ifndef SM_S_H 00083 #include <sm_s.h> // declares key_type_s, rid_t, lsn_t 00084 #endif 00085 00086 #ifndef LEXIFY_H 00087 #include <lexify.h> // declares sortorder with constants 00088 #endif 00089 00090 #ifndef NBOX_H 00091 #include <nbox.h> // key_info_t contains nbox_t 00092 #endif /* NBOX_H */ 00093 00094 #ifndef SORT_S_H 00095 #include <sort_s.h> // declares key_info_t 00096 #endif 00097 00098 /* DOXYGEN Documentation : */ 00099 00100 /**\addtogroup LOGSPACE 00101 * 00102 * Updates performed by transactions are logged so that 00103 * they can be rolled back (in the event of a transaction abort) 00104 * or restored (in the event of a crash). Both the old and new values 00105 * of an updated location are logged. 
This allows a steal, no-force 00106 * buffer management policy, which means the buffer manager is free 00107 * to write dirty pages to disk at any time and yet does not have 00108 * to write dirty pages for a transaction to commit. 00109 * 00110 * The log is stored in a set of Unix files, all in the same directory, 00111 * whose path is determined by a run-time option. 00112 * The maximum size of the log is also determined by a run-time option. 00113 * The proper value of the log size depends on 00114 * the expected transaction mix. More specifically, it depends on the 00115 * age of the oldest (longest running) transaction in the system and 00116 * the amount of log space used by all active transactions. Here are 00117 * some general rules to determine the amount of free log space 00118 * available in the system. 00119 * - Log records between the first log 00120 * record generated by the oldest active transaction and the most 00121 * recent log record generated by any transaction cannot be thrown 00122 * away. 00123 * - Log records from a transaction are no longer needed 00124 * once the transaction has committed or completely aborted and all 00125 * updates have made it to disk. Aborting a transaction causes log space 00126 * to be used, so space is reserved for aborting each transaction. 00127 * Enough log space must be available to commit or abort all active 00128 * transactions at all times. 00129 * 00130 * - Only space starting at the beginning of the log can be reused. 00131 * This space can be reused if it contains log records only for 00132 * transactions meeting the previous rule. 00133 * 00134 * - All storage manager calls that update records require log space twice 00135 * the size of the space updated in the record. All calls that create, 00136 * append, or truncate records require log space equal to the size 00137 * created, inserted, or deleted. 
Log records generated by these calls 00138 * (generally one per call) have an overhead of approximately 50 bytes. 00139 * 00140 * - The amount of log space reserved for aborting a transaction is equal to 00141 * the amount of log space generated by the transaction plus a fudge 00142 * factor. 00143 * (Where btrees are concerned, a structure modification 00144 * might be necessary on abort, using more space on abort, or might not be 00145 * necessary on abort where it was done during forward processing, 00146 * using less space on abort.) 00147 * 00148 * - The transaction assumes responsibility for reserving space in the 00149 * log so that it can abort, should it need to (without leaving an 00150 * unrecoverable volume). The transaction and the log cooperate to 00151 * reserve space for the transaction's aborting. 00152 * 00153 * - When insufficient log space is available for a transaction, the 00154 * transaction is (may be, depending on the server) aborted. 00155 * The storage manager will return an error indication (out of log space) 00156 * if it is unable to insert a log record into the log due to 00157 * insufficient space. 00158 * 00159 * \bug GNATS 142 There remain a number of places in the storage manager code 00160 * that react to a lack of log space with a fatal error; this is a 00161 * hold-over from the original storage manager, before any attempt to 00162 * reserve space was in place. This code has to be rewritten to handle more 00163 * gracefully such errors. In order for this to be done, the 00164 * multi-threaded transaction support will be deprecated. 00165 * 00166 * Checkpoints are taken periodically by the storage manager in order to 00167 * free log space and shorten recovery time. Checkpoints are "fuzzy" 00168 * and do not require the system to pause while they are completing. 00169 * 00170 * See the storage manager constructor ss_m::ss_m for more information 00171 * about handling out-of-logspace conditions. 
00172 * 00173 */ 00174 00175 /**\addtogroup SSMOPT 00176 * 00177 * These are the run-time options for the storage manager. 00178 * 00179 * -sm_bufpoolsize : 00180 * - type: number 00181 * - description: This is the size of 00182 * the buffer pool in Kb. Must be large enough to hold at least 32 pages, 00183 * so it depends on the configured page size. 00184 * - default: none 00185 * - required?: yes 00186 * 00187 * -sm_hugetlbfs_path 00188 * - type: string (full absolute path name) 00189 * - description: Needed only if you configured --with-hugetlbfs. 00190 * - default: see \ref CONFIGOPT 00191 * - required?: no 00192 * 00193 * -sm_reformat_log 00194 * - type: Boolean 00195 * - description: If "yes", your log will be clobbered and the storage 00196 * manager will start up with an entirely new log. 00197 * - default: no 00198 * - required?: no 00199 * 00200 * -sm_logdir 00201 * - type: string (relative or absolute path name) 00202 * - description: Location of the log files. 00203 * - default: none 00204 * - required?: yes 00205 * 00206 * -sm_logbufsize 00207 * - type: number 00208 * - description: size of log buffer in KB. 00209 * Must be greater than or equal to the larger of 00210 * (4 times the page size, 64 Kb) 00211 * and less than or equal to 00212 * 128 times the page_size. This is the size of 00213 * the log buffer in Kb. 00214 * - default: 128 00215 * - required?: no 00216 * 00217 * -sm_logsize 00218 * - type: number 00219 * - description: greater than or equal to 8256 00220 * This is the maximum size of the log in Kb. It is a function of 00221 * the log buffer size, and the default is the minimum allowable for 00222 * the default sm_logbufsize. 00223 * - default: 128 00224 * - required?: yes 00225 * 00226 * -sm_log_warn 00227 * - type: number between 0 and 100 (percentage) 00228 * - description: percentage of log that, when consumed by active 00229 * transactions, triggers a callback warning of potential inability 00230 * to roll back. 
Should be less than 50. 00231 * - default: 45 00232 * - required?: no 00233 * 00234 * -sm_errlog 00235 * - type: string (relative or absolute path name OR - ) 00236 * - description: Destination for error messages. If "-" is given, 00237 * the destination is stderr. 00238 * - default: \b - 00239 * - required?: no 00240 * 00241 * -sm_errlog_level 00242 * - type: string (one of none|emerg|fatal|internal|error|warning|info|debug) 00243 * - description: filter. Messages of this priority or higher are issued to 00244 * the error log; messages with lower priority are not issued. 00245 * The priorities are listed from high to low. "none" means no logging 00246 * will happen. 00247 * - default: error 00248 * - required?: no 00249 * 00250 * -sm_locktablesize : 00251 * - type: number greater than or equal to 64 00252 * - description: size of lock manager's hash table will be a prime 00253 * number near and greater than the given number. 00254 * - default: 64000 (yields a hash table with 65521 buckets) 00255 * - required?: no 00256 * 00257 * -sm_lock_escalate_to_page_threshold 00258 * - type: number greater than or equal to 0 00259 * - description: after acquiring this many record locks on a page, the lock 00260 * will be escalated to a page lock. A value of 0 disables escalation to a 00261 * page lock. 00262 * - default: 5 00263 * - required?: no 00264 * 00265 * -sm_lock_escalate_to_store_threshold 00266 * - type: number greater than or equal to 0 00267 * - description: after acquiring this many page locks in a store, 00268 * the lock will be escalated to a store lock. 00269 * A value of 0 disables escalation to a store lock. 00270 * - default: 25 00271 * - required?: no 00272 * 00273 * -sm_lock_escalate_to_volume_threshold 00274 * - type: number greater than or equal to 0 00275 * - description: after acquiring this many store locks in a volume, 00276 * the lock will be escalated to a volume lock. 00277 * A value of 0 disables escalation to a volume lock. 
00278 * - default: 0 00279 * - required?: no 00280 * 00281 * -sm_cc_alg 00282 * - type: string (one of file | page | record | none) 00283 * - description: default locking granularity for file operations. 00284 * This can be overridden on a per-transaction basis with 00285 * ss_m::set_xct_lock_level(). 00286 * - default: record 00287 * - required?: no 00288 * 00289 * -sm_backgroundflush 00290 * - type: Boolean 00291 * - description: Enables background-flushing of volumes. 00292 * Must be set to "yes" for sm_num_page_writers to have any effect. 00293 * - default: yes 00294 * - required?: no 00295 * 00296 * -sm_num_page_writers 00297 * - type: number 00298 * - description: greater than or equal to 0; this is the number of 00299 * background-flushing threads for each volume. If you have 00300 * lots of threads, 00301 * a huge buffer pool, and few volumes, you should increase this. 00302 * If sm_backgroundflush is "no", this value is ignored. 00303 * - default: 2 00304 * - required?: no 00305 * 00306 * -sm_prefetch 00307 * - type: Boolean 00308 * - description: Enables prefetching for scans. 00309 * - default: no 00310 * - required?: no 00311 * 00312 * -sm_logging 00313 * - type: Boolean 00314 * - description: Allows you to turn off logging for a run of 00315 * the storage manager. This is only for experimentation, to 00316 * measure logging overhead in a limited way. 00317 * Aborts, rollbacks and restart/recovery 00318 * do not work without logging. Independent concurrent 00319 * transactions using btrees might not work without logging (this is 00320 * not well-tested). 00321 * Each time you start the server, you had better start with a 00322 * clean device or a device that resulted from a clean shutdown 00323 * of the prior run. 00324 * - default: yes 00325 * - required?: no 00326 * 00327 * -sm_lock_caching 00328 * - type: Boolean 00329 * - description: Enables caching of transaction locks in transaction. 00330 * Can be turned off for experimentation. 
If no, the default is not 00331 * to cache locks, but any transaction can turn on caching for itself 00332 * by calling the ss_m method set_lock_cache_enable(bool enable). 00333 * - default: yes 00334 * - required?: no 00335 * 00336 * \sa \ref SSMVAS 00337 */ 00338 00339 00340 /**\addtogroup SSMXCT 00341 * All storage manager operations on data must be done within the scope of 00342 * a transaction (ss_m::begin_xct, ss_m::commit_xct, ss_m::abort_xct, 00343 * ss_m::chain_xct). 00344 * 00345 * A very few storage manager operations, such as formatting a volume, are 00346 * called outside the scope of a transaction and the storage manager begins 00347 * its own transaction to do the work. 00348 * 00349 * Operations that fail return an error indication and the storage 00350 * manager assumes that the server will thereafter abort the 00351 * transaction in which the error occurred, when abort is indicated. 00352 * Abort is indicated when eUSERABORT or eDEADLOCK is returned and 00353 * when the server chooses to abort rather than to work around the problem 00354 * (whatever it might be, such as eRETRY). 00355 * 00356 * The storage manager does not enforce the aborting of any erroneous 00357 * transactions except, possibly, those that are in danger of 00358 * running out of log space. 00359 * (This is done with the destructor of the prologue used on each call 00360 * to the storage manager, see next paragraph). 00361 * 00362 * It is always the server's responsibility to abort. 
00363 * When the storage manager 00364 * encounters a eLOGSPACEWARN condition (the log hasn't enough 00365 * space \e at \e this \e moment to abort the running transaction, 00366 * assuming a 1:1 ratio of rollback-logging overhead to forward-processing 00367 * logging overhead), it does one of two things: 00368 * - passes the error code eLOGSPACEWARN up the call stack back to the server 00369 * if the storage manager was constructed with no log-space-warning callback 00370 * argument (see LOG_WARN_CALLBACK_FUNC, ss_m::ss_m). 00371 * - tries to abort a transaction before passing an error code back up 00372 * the call stack to the server. Choosing a victim transaction to abort 00373 * is done by the server in its log-space-warning callback function (passed 00374 * in on ss_m::ss_m, q.v.). 00375 * Only if that callback function returns a non-null victim transaction 00376 * and returns eUSERABORT does the storage manager abort that victim 00377 * before returning eUSERABORT up the call stack. Any other 00378 * error code returned by the callback function is just returned up 00379 * the call stack. 00380 * 00381 * \section LOCKS Locks 00382 * 00383 * The storage manager automatically acquires the 00384 * necessary locks when the data are read or written. 00385 * The locks thus acquired are normally released at the end of a transaction, 00386 * thus, by default, transactions are two-phase and well-formed (degree 3). 00387 * 00388 * \subsection GRAN Lock Granularity 00389 * The fine-grained locks are normally used for records in files, but 00390 * provision is made for using coarser-grained locks. The transaction 00391 * has a default lock level associated with it, 00392 * which governs the granularity of locks acquired by the storage manager 00393 * on behalf of the transaction. 00394 * The lock manager provides for lock escalation to coarser locks to 00395 * reduce the locking costs. See \ref SSMLOCK and smlevel_0::concurrency_t. 
00396 * 00397 * Key-value locking is normally used for B+-Trees. (See \ref MOH1.) 00398 * R*-Trees normally use coarse-granularity locking. 00399 * The locking protocol used with an index is determined when the 00400 * index is created. A transaction may acquire coarse (index-level) 00401 * locks with explicit calls to the lock manager, but by default, 00402 * the granularity/level/protocol associated with the index is used. 00403 * See smlevel_0::concurrency_t. 00404 * 00405 * \section DISTXCT Distributed Transactions 00406 * Storage manager transactions may be used as "threads" (to 00407 * overload this term) of distributed transactions. 00408 * Coordination of 2-phase commit must be done externally, 00409 * but the storage manager supports preparing the (local) transaction "thread" 00410 * for two-phase commit, and it will log the necessary 00411 * data for recovering in-doubt transactions. 00412 * 00413 * \section ATTACH Threads and Transactions 00414 * Transactions are not tied to storage manager threads (smthread_t, not 00415 * to be confused with a local "thread" of a distributed transaction) in any 00416 * way other than that a transaction must be \e attached to a 00417 * thread while any storage manager work is being done on behalf of 00418 * that transaction. This is how the storage manager knows \e which 00419 * transaction is to acquire the locks and latches, etc. 00420 * But a thread can attach and detach from transactions at will, so 00421 * work may be performed by different threads each time the storage 00422 * manager is called on behalf of a given transaction; this allows the 00423 * server to keep a pool of threads to perform work and allows them to 00424 * perform work on behalf of any active transaction. 
00425 * 00426 * \warning 00427 * While there are limited circumstances in which multiple threads can be 00428 * attached to the same transaction \e concurrently and perform storage 00429 * manager operations on behalf of that transaction concurrently, 00430 * which is a hold-over from the original storage manager, this 00431 * functionality will be deprecated soon. The reason for this being 00432 * removed is that it is extremely difficult to handle errors internally 00433 * when multiple threads are attached to a transaction because 00434 * partial rollback is impossible in the absence of multiple log streams 00435 * for a transaction. 00436 * 00437 * Under no circumstances may a thread attach to more than one transaction 00438 * at a time. 00439 * 00440 * 00441 * \section EXOTICA Exotica 00442 * The storage manager also provides 00443 * - partial rollback (ss_m::save_work and ss_m::rollback_work), 00444 * which undoes actions but does not release locks, 00445 * - transaction chaining (ss_m::chain_xct), which commits, but retains locks 00446 * and gives them to a new transaction, 00447 * - lock release (sm_quark_t, ss_m::unlock), allowing less-than-3-degree 00448 * transactions. 00449 * 00450 * To reduce the cost (particularly in logging) of loading databases, 00451 * the storage manager provides for unlogged loading of stores. 00452 * See \ref SSMSTORE. 00453 */ 00454 00455 00456 /**\addtogroup SSMDEBUG 00457 * 00458 * \section DEBUGLEV Build-time Debugging Options 00459 * At configure time, you can control which debugger-related options 00460 * (symbols, inlining, etc) with the debug-level options. See \ref CONFIGOPT. 00461 * \section SSMTRACE Tracing (--enable-trace) 00462 * When this build option is used, additional code is included in the build to 00463 * enable some limited tracing. These C Preprocessor macros apply: 00464 * -W_TRACE 00465 * --enable-trace defines this. 00466 * -FUNC 00467 * Outputs the function name when the function is entered. 
00468 * -DBG 00469 * Outputs the arguments. 00470 * -DBGTHRD 00471 * Outputs the arguments. 00472 * 00473 * The tracing is controlled by these environment variables: 00474 * -DEBUG_FLAGS: a list of file names to trace, e.g. "smfile.cpp log.cpp" 00475 * -DEBUG_FILE: name of destination for the output. If not defined, the output 00476 * is sent to cerr/stderr. 00477 * 00478 * See \ref CONFIGOPT. 00479 * \note This tracing is not thread-safe, as it uses streams output. 00480 * \section SSMENABLERC Return Code Checking (--enable-checkrc) 00481 * If a w_rc_t is set but not checked with method is_error(), upon destruction the 00482 * w_rc_t will print a message to the effect "error not checked". 00483 * See \ref CONFIGOPT. 00484 * 00485 */ 00486 00487 /** \file sm_vas.h 00488 * \details 00489 * This is the include file that all value-added servers should 00490 * include to get the Shore Storage Manager API. 00491 * 00492 */ 00493 /********************************************************************/ 00494 /* Forward declarations only: these classes are named by the API below but are not defined in this part of the header. */ 00495 class page_p; 00496 class xct_t; 00497 class device_m; 00498 class vec_t; 00499 class log_m; 00500 class lock_m; 00501 class btree_m; 00502 class file_m; 00503 class pool_m; 00504 class dir_m; 00505 class chkpt_m; 00506 class lid_m; 00507 class sm_stats_cache_t; 00508 class option_group_t; 00509 class option_t; 00510 class prologue_rc_t; 00511 class rtree_m; 00512 class sort_stream_i; 00513 00514 /**\addtogroup SSMSP 00515 * A transaction may perform a partial rollback using savepoints. 00516 * The transaction populates a savepoint by calling ss_m::save_work, 00517 * then it may roll back to that point with ss_m::rollback_work. 00518 * Locks acquired between the save_work and rollback_work are \e not 00519 * released. 00520 */ 00521 00522 /**\brief A point to which a transaction can roll back. 
00523 * \ingroup SSMSP 00524 *\details 00525 * A transaction can do partial rollbacks with 00526 * save_work and rollback_work, which use this class to determine 00527 * how far to roll back. 00528 * It is nothing more than a log sequence number for the work done 00529 * to the point when save_work is called. 00530 * The class also carries a transaction id (_tid), readable through tid(); 00530 * only the friend class ss_m and operator>> can change it. 00530 */ 00531 class sm_save_point_t : public lsn_t { 00532 public: /* default construction sets the stored transaction id to (0,0) */ 00533 NORET sm_save_point_t(): _tid(0,0) {}; /* text form written is "<tid>:<lsn>" */ 00534 friend ostream& operator<<(ostream& o, const sm_save_point_t& p) { 00535 return o << p._tid << ':' << (const lsn_t&) p; 00536 } /* reads tid, one separator character, then the lsn; the separator is consumed but not checked to be ':' */ 00537 friend istream& operator>>(istream& i, sm_save_point_t& p) { 00538 char ch; 00539 return i >> p._tid >> ch >> (lsn_t&) p; 00540 } /* returns a copy of the stored transaction id */ 00541 tid_t tid() const { return _tid; } 00542 private: 00543 friend class ss_m; 00544 tid_t _tid; 00545 }; 00546 00547 /**\addtogroup SSMQK 00548 * A quark is a marker in the transaction's list of acquired locks. 00549 * One may release all short-duration locks acquired since the quark was inserted 00550 * into the list via sm_quark_t::open(). 00551 * The lock manager modifies the locks acquired inside a quark 00552 * so that non-extent locks are no longer than short-duration. 00553 * 00554 * This is for experimentation only, and is \e not well-tested or supported. 00555 * 00556 * How used: 00557 * \code 00558 * sm_quark_t *q = new sm_quark_t; 00559 * q->open(); // inserts marker in transaction's list. 00560 * ... 00561 * q->close(); // frees short-duration locks to the marker. 00562 * delete q; 00563 * \endcode 00564 * 00565 * Deleting the quark without closing it causes it to be closed. 00566 * Quarks may \e not be used with multi-threaded transactions. 
00567 * 00568 * Note that if a transaction has multiple threads attached when 00569 * a thread opens a quark, there is no way to determine where the 00570 * quark takes effect, and since it affects the locks acquired by 00571 * all threads of the transaction, it must be used very carefully 00572 * where multiply-threaded transactions are concerned. 00573 */ 00574 00575 /**\brief List of locks acquired by a transaction since 00576 * the quark was "opened". 00577 * \ingroup SSMQK 00578 * \details 00579 * When a quark is closed (by calling close()), 00580 * the \a release parameter (default \e true) indicates if all short-duration read 00581 * locks acquired during the quark should be released. 00582 * \note Quarks are an experimental feature for use 00583 * as a building block for a more general nested-transaction facility. 00584 * 00585 * \internal See lock_x.h 00586 */ 00587 class sm_quark_t { 00588 public: 00589 NORET sm_quark_t() {} /* the destructor is defined elsewhere; per the SSMQK notes, deleting an unclosed quark closes it */ 00590 NORET ~sm_quark_t(); 00591 /* open() inserts the marker into the transaction's lock list; close() frees short-duration locks back to that marker (per the SSMQK usage notes) */ 00592 rc_t open(); 00593 rc_t close(bool release=true); 00594 /* returns a copy of the stored transaction id */ 00595 tid_t tid()const { return _tid; } /* converts to true iff the stored transaction id differs from tid_t::null */ 00596 operator bool()const { return (_tid != tid_t::null); } 00597 friend ostream& operator<<(ostream& o, const sm_quark_t& q); 00598 friend istream& operator>>(istream& i, sm_quark_t& q); 00599 00600 private: 00601 friend class ss_m; 00602 tid_t _tid; 00603 /* copy construction and copy assignment are declared private so client code cannot copy a quark */ 00604 // disable 00605 sm_quark_t(const sm_quark_t&); 00606 sm_quark_t& operator=(const sm_quark_t&); 00607 00608 }; 00609 00610 class sm_store_info_t; 00611 class log_entry; 00612 class coordinator; 00613 class tape_t; 00614 /**\brief \b This \b is \b the \b SHORE \b Storage \b Manager \b API. 00615 *\details 00616 * Most of the API for using the storage manager is through this 00617 * interface class. 
00618 */ 00619 class ss_m : public smlevel_top 00620 { 00621 friend class pin_i; 00622 friend class sort_stream_i; 00623 friend class prologue_rc_t; 00624 friend class log_entry; 00625 friend class coordinator; 00626 friend class tape_t; 00627 public: 00628 00629 typedef smlevel_0::LOG_WARN_CALLBACK_FUNC LOG_WARN_CALLBACK_FUNC; 00630 typedef smlevel_0::LOG_ARCHIVED_CALLBACK_FUNC LOG_ARCHIVED_CALLBACK_FUNC; 00631 typedef smlevel_0::ndx_t ndx_t; 00632 typedef smlevel_0::concurrency_t concurrency_t; 00633 typedef smlevel_1::xct_state_t xct_state_t; 00634 00635 typedef sm_store_property_t store_property_t; 00636 00637 #if COMMENT 00638 // 00639 // Below is most of the interface for the SHORE Storage Manager. 00640 // The rest is located in pin.h, scan.h, and smthread.h 00641 // 00642 00643 // 00644 // TEMPORARY FILES/INDEXES 00645 // 00646 // When a file or index is created there is a tmp_flag parameter 00647 // that when true indicates that the file is temporary. 00648 // Operations on a temporary file are not logged and the 00649 // file will be gone the next time the volume is mounted. 00650 // 00651 // TODO: IMPLEMENTATION NOTE on Temporary Files/Indexes: 00652 // Temp files cannot be trusted after transaction abort. 00653 // They should be marked for removal. 00654 // 00655 // CODE STRUCTURE: 00656 // Almost all ss_m functions begin by creating a prologue object 00657 // whose constructor and descructor check for many common errors. 00658 // In addition most ss_m::OP() functions now call an ss_m::_OP() 00659 // function to do the real work. The ss_m::OP functions should 00660 // not be called by other ss_m functions, instead the corresponding 00661 // ss_m::_OP function should be used. 00662 // 00663 00664 #endif /* COMMENT */ 00665 00666 public: 00667 /**\brief Add storage manager options to the given options group. 
00668 *\ingroup SSMINIT 00669 *\details 00670 * @param[in] grp The caller's option group, to which the 00671 * storage manager's options will be added for processing soon. 00672 * 00673 * Before the ss_m constructor can be called, setup_options 00674 * \b must be called. This will install the storage manager's options and 00675 * initialize any that are not required. 00676 * Once all required options have been set, an ss_m can be constructed. 00677 * 00678 *\note This is not thread-safe. The application (server) must prevent 00679 * concurrent calls to setup_options. 00680 */ 00681 static rc_t setup_options(option_group_t* grp); 00682 00683 /**\brief Initialize the storage manager. 00684 * \ingroup SSMINIT 00685 * \details 00686 * @param[in] warn A callback function. This is called 00687 * when/if the log is in danger of becoming "too full". 00688 * @param[in] get A callback function. This is called 00689 * when the storage manager needs an archived log file to be restored. 00690 * 00691 * When an ss_m object is created, the storage manager initializes itself 00692 * and, 00693 * if the sthreads package has not already been initialized by virtue 00694 * of an sthread_t running, the sthreads package is initialized now. 00695 * 00696 * The log is read and recovery is performed (\ref MHLPS), 00697 * and control returns to 00698 * the caller, after which time 00699 * storage manager threads (instances of smthread_t) may be constructed and 00700 * storage manager may be used. 00701 * 00702 * The storage manager is used by invoking its static methods. 00703 * You may use them as follows: 00704 * \code 00705 * ss_m *UNIQ = new ss_m(); 00706 * 00707 * W_DO(UNIQ->mount_dev(...)) 00708 * // or 00709 * W_DO(ss_m::mount_dev(...)) 00710 * \endcode 00711 * ). 00712 * 00713 * Only one ss_m object may be extant at any time. If you try 00714 * to create another while the one exists, a fatal error will occur 00715 * (your program will choke with a message about your mistake). 
00716 * 00717 * The callback argument given to the storage manager constructor 00718 * is called when the storage manager determines that it is in danger 00719 * of running out of log space. Heuristics are used to guess when 00720 * this is the case. 00721 * 00722 * If the function \a warn archives and removes log files, the function 00723 * \a get must be provided to restore those log files when the 00724 * storage manager needs them. 00725 * 00726 * For details and examples, see \ref smlevel_0::LOG_WARN_CALLBACK_FUNC, 00727 * \ref smlevel_0::LOG_ARCHIVED_CALLBACK_FUNC, and 00728 * \ref LOGSPACE. 00729 */ 00730 ss_m(LOG_WARN_CALLBACK_FUNC warn=NULL, LOG_ARCHIVED_CALLBACK_FUNC get=NULL); 00731 00732 /**\brief Shut down the storage manager. 00733 * \ingroup SSMINIT 00734 * \details 00735 * When the storage manager object is deleted, it shuts down. 00736 * Thereafter it is not usable until another ss_m object is 00737 * constructed. 00738 */ 00739 ~ss_m(); 00740 00741 /**\brief Cause the storage manager's shutting down to be done cleanly 00742 * or to simulate a crash. 00743 * \ingroup SSMINIT 00744 * \details 00745 * @param[in] clean True means shut down gracefully, false means simulate a crash. 00746 * 00747 * When the storage manager's destructor is called 00748 * the buffer pool is flushed to disk, unless this method is called 00749 * with \a clean == \e false. 00750 * 00751 * \note If this method is used, it 00752 * must be called after the storage manager is 00753 * constructed if it is to take effect. Each time the storage 00754 * manager is constructed, the state associated with this is set 00755 * to \e true, i.e., "shut down properly". 00756 * 00757 * \note This method is not thread-safe, only one thread should use this 00758 * at any time, presumably just before shutting down. 00759 */ 00760 static void set_shutdown_flag(bool clean); 00761 00762 /**\brief Notify storage manager when a log file was archived by a 00763 * LOG_WARN_CALLBACK_FUNC. 
00764 * 00765 * The arguments: 00766 * @param[in] logfile Character string name of file archived. 00767 */ 00768 static rc_t log_file_was_archived(const char * logfile); 00769 00770 private: 00771 void _construct_once(LOG_WARN_CALLBACK_FUNC x=NULL, 00772 LOG_ARCHIVED_CALLBACK_FUNC y=NULL); 00773 void _destruct_once(); 00774 00775 00776 public: 00777 /**\addtogroup SSMXCT 00778 * 00779 * All work performed on behalf of a transaction must occur while that 00780 * transaction is "attached" to the thread that performs the work. 00781 * Creating a transaction attaches it to the thread that creates the transaction. 00782 * The thread may detach from the transaction and attach to another. 00783 * Multiple threads may attach to a single transaction and do work in certain circumstances. See \ref SSMMULTIXCT 00784 * 00785 * 00786 */ 00787 /**\brief Begin a transaction 00788 *\ingroup SSMXCT 00789 * @param[in] timeout Optional, controls blocking behavior. 00790 * \details 00791 * 00792 * Start a new transaction and "attach" it to this thread. 00793 * No running transaction may be attached to this thread. 00794 * 00795 * Storage manager methods that must block (e.g., to acquire a lock) 00796 * will use the timeout given. 00797 * The default timeout is the one associated with this thread. 00798 * 00799 * \sa timeout_in_ms 00800 */ 00801 static rc_t begin_xct( 00802 timeout_in_ms timeout = WAIT_SPECIFIED_BY_THREAD); 00803 00804 /**\brief Begin an instrumented transaction. 00805 *\ingroup SSMXCT 00806 * @param[in] stats Pointer to an allocated statistics-holding structure. 00807 * @param[in] timeout Optional, controls blocking behavior. 00808 * \details 00809 * No running transaction may be already attached to this thread. 00810 * A new transaction is started and attached to the running thread. 00811 * 00812 * The transaction will be instrumented. 00813 * This structure is updated by the storage manager whenever a thread 00814 * detaches from this transaction. 
The activity recorded during 00815 * the time the thread is attached to the transaction will be stored in 00816 * the per-transaction statistics. 00817 * \attention It is the client's 00818 * responsibility to delete the statistics-holding structure. 00819 * 00820 * Storage manager methods that must block (e.g., to acquire a lock) 00821 * will use the timeout given. 00822 * The default timeout is the one associated with this thread. 00823 * 00824 * \sa timeout_in_ms 00825 */ 00826 static rc_t begin_xct( 00827 sm_stats_info_t* stats, // allocated by caller 00828 timeout_in_ms timeout = WAIT_SPECIFIED_BY_THREAD); 00829 00830 /**\brief Begin a transaction and return the transaction id. 00831 *\ingroup SSMXCT 00832 * @param[out] tid Transaction id of new transaction. 00833 * @param[in] timeout Optional, controls blocking behavior. 00834 * \details 00835 * 00836 * No running transaction may be attached to this thread. 00837 * 00838 * Storage manager methods that must block (e.g., to acquire a lock) 00839 * will use the timeout given. 00840 * The default timeout is the one associated with this thread. 00841 * 00842 * \sa timeout_in_ms 00843 */ 00844 static rc_t begin_xct( 00845 tid_t& tid, 00846 timeout_in_ms timeout = WAIT_SPECIFIED_BY_THREAD); 00847 00848 /**\addtogroup SSM2PC 00849 * The storage manager contains support for externally-coordinated 00850 * transactions that use 00851 * two-phase-commit with presumed abort. 00852 * The server must provide the coordination and the coordinator is 00853 * assumed to have its own stable storage, and it is assumed to recover 00854 * from failures in a "short time", the precise meaning of which is given below. 00855 * A prepared transaction, like an active transaction, 00856 * consumes log space and holds locks. 00857 * Even if a prepared transaction does not hold locks needed by 00858 * other transactions, it consumes resources in a way that can interfere 00859 * with other transactions.
00860 * If a prepared transaction remains in the system for a long time 00861 * while other transactions are running, eventually the storage 00862 * manager needs the log space used (reserved) by the prepared transaction. 00863 * A coordinator must resolve its prepared transactions 00864 * before the storage manager effectively runs out of 00865 * log space for other transactions in the system. 00866 * The amount of time involved is a function of the size of the log 00867 * and of the demands of the other transactions in the system. 00868 * 00869 * For the purpose of this discussion, the portion of a global 00870 * transaction that involves a single Shore Storage Manager transaction is 00871 * called a thread of the global transaction. 00872 * 00873 * A Shore transaction participates as a thread of a global transaction 00874 * as follows: 00875 - Start a storage-manager transaction with ss_m::begin_xct. 00876 - Acquire a global transaction identifier from the coordinator. 00877 - Indicate to the storage manager that this transaction is a 00878 thread of a global transaction, and associate the global transaction 00879 identifier with this thread by calling ss_m::enter_2pc. 00880 - Associate a coordinator with the transaction for recovery 00881 purposes, by calling ss_m::set_coordinator. 00882 - Prepare the thread of the transaction and get the storage manager's 00883 vote with ss_m::prepare_xct. 00884 It is an error to commit a global transaction thread without first 00885 preparing it. It is an error to do anything else 00886 in a transaction after it is prepared, except to end 00887 the transaction or retry the prepare (to get the vote again). 00888 - Convey the vote to the coordinator, and determine the transaction's 00889 fate from the coordinator. 00890 - End the thread with ss_m::commit_xct or ss_m::abort_xct. 
00891 * 00892 * The storage manager 00893 * logs the minimal information required to effect a vote of the 00894 * transaction threads that are storage manager transactions, 00895 * and to recover such in-doubt transactions after restart. 00896 * Thus, after a crash/restart, the server may query the storage manager 00897 * about in-doubt (prepared) transactions with ss_m::query_prepared_xct, 00898 * which tells the caller the number and global transaction IDs associated 00899 * with prepared transactions. 00900 * Using this, the server contacts the coordinator and resumes the 00901 * voting. 00902 * The server may find the local transaction IDs and use ss_m::tid_to_xct 00903 * to attach these transactions and to resolve them. 00904 * 00905 * Commit and abort of read-only transactions are the same, 00906 * as these transactions have no log entries. Preparing read-only transactions 00907 * causes them to commit/abort and the vote returned is vote_readonly. 00908 * Once this vote is communicated to the coordinator and the coordinator 00909 * records it on stable storage, there is no need to involve this thread in 00910 * any further processing. For this reason, 00911 * read-only transactions do not appear as prepared transactions at 00912 * recovery time. 00913 * 00914 */ 00915 00916 /**\brief Make the attached transaction a thread of a distributed transaction. 00917 *\ingroup SSM2PC 00918 * 00919 * @param[in] gtid Global transaction ID to associate with this transaction. This will be logged when the transaction is prepared. 00920 * 00921 * \note This can be called at most once for a given transaction. 00922 * The transaction must be attached to the calling thread. 00923 * No other threads may be attached to the transaction. 00924 */ 00925 static rc_t enter_2pc(const gtid_t &gtid); 00926 /**\brief Assign a coordinator handle to this distributed transaction. 00927 *\ingroup SSM2PC 00928 * @param[in] h Handle of the coordinator.
Not interpreted by 00929 * the storage manager. 00930 * 00931 * The storage manager associates this server handle with the transaction 00932 * so that when the transaction is prepared, this information is 00933 * written to the log. Upon recovery, if this transaction is still in doubt, 00934 * the value-added server can query the 00935 * storage manager for in-doubt transactions, get their server handles, 00936 * and resolve the transactions. 00937 * See query_prepared_xct and recover_2pc. 00938 */ 00939 static rc_t set_coordinator(const server_handle_t &h); 00940 00941 /**\brief Prepare a thread of a distributed transaction. 00942 *\ingroup SSM2PC 00943 * @param[in] stats Pointer to an allocated statistics-holding 00944 * structure. 00945 * @param[out] vote This thread's vote. 00946 * 00947 * The storage manager will prepare the attached transaction (a thread 00948 * of a distributed transaction) for commit. 00949 * If this transaction has performed no logged updates, the 00950 * vote returned will be vote_readonly. 00951 * If this transaction can commit, the vote returned will be vote_commit. 00952 * If an error occurs during the prepare, the vote will be vote_abort. 00953 * 00954 * If the transaction is being instrumented, the 00955 * statistics-holding structure will be returned to the caller, 00956 * and the caller is responsible for its deallocation. 00957 */ 00958 static rc_t prepare_xct( 00959 sm_stats_info_t*& stats, 00960 vote_t& vote); 00961 00962 /**\brief Prepare a thread of a distributed transaction. 00963 *\ingroup SSM2PC 00964 * @param[out] vote This thread's vote. See \ref w_base_t::vote_t. 00965 * 00966 * The storage manager will prepare the attached transaction (a thread 00967 * of a distributed transaction) for commit. 00968 * If this transaction has performed no logged updates, the 00969 * vote returned will be vote_readonly. 00970 * If this transaction can commit, the vote returned will be vote_commit. 
00971 * If an error occurs during the prepare, the vote will be vote_abort. 00972 */ 00973 static rc_t prepare_xct(vote_t &vote); 00974 00975 /**\brief Force the transaction to vote "read-only" in a two-phase commit. 00976 *\ingroup SSM2PC 00977 * \details 00978 * This will override the storage manager's determination of 00979 * whether this thread of a distributed transaction is read-only, which is 00980 * based on whether the local transaction thread logged anything. This 00981 * method may be useful if the local transaction rolled back to 00982 * a savepoint. 00983 * See \ref w_base_t::vote_t. 00984 */ 00985 static rc_t force_vote_readonly(); 00986 00987 /**\brief Given a global transaction id, find the local prepared 00988 * transaction associated with it. 00989 *\ingroup SSM2PC 00990 * @param[in] gtid A global transaction ID (an opaque quantity 00991 * to the storage manager). 00992 * @param[in] mayblock Not used. 00993 * @param[out] local Return the transaction ID of the prepared 00994 * SM transaction. 00995 * \details 00996 * Searches the transaction list for a prepared transaction with the given 00997 * global transaction id. If found, it returns a reference to the 00998 * local transaction. The transaction is attached to the running 00999 * thread before it is returned. 01000 */ 01001 static rc_t recover_2pc(const gtid_t & gtid, 01002 bool mayblock, 01003 tid_t & local 01004 ); 01005 01006 /**\brief Return the number of prepared transactions. 01007 *\ingroup SSM2PC 01008 * @param[out] numtids The number of in-doubt transactions. 01009 * \details 01010 * Used by a server at start-up, after recovery, to find out if 01011 * there are any in-doubt transactions. If so, the server must 01012 * use the second form of query_prepared_xct to find the global 01013 * transaction IDs of these in-doubt transactions. 01014 */ 01015 static rc_t query_prepared_xct(int &numtids); 01016 01017 /**\brief Return the global transaction IDs of in-doubt transactions. 
01018 *\ingroup SSM2PC 01019 * @param[in] numtids The number of global transaction ids in the list. 01020 * @param[out] l The caller-provided list into which to write the 01021 * global transaction-ids. 01022 * \details 01023 * Used by a server at start-up, after recovery, to find out the 01024 * global transaction IDs of the prepared transactions. The storage 01025 * manager fills in the first numtids entries of the pre-allocated list. 01026 * The server may have first called the first form of query_prepared_xct 01027 * to find out how many such transactions there are after recovery. 01028 * 01029 * \attention Read-only transactions 01030 * do not appear as in-doubt transactions. Because they did not 01031 * generate any log records, they will not be "discovered" by analysis. 01032 * The server must determine that any thread of a global transaction that 01033 * does not appear to be in doubt was a read-only thread or 01034 * it never prepared and thus has been aborted. 01035 * Read-only transactions that were prepared would have voted read-only, 01036 * and if the coordinator recorded that vote on stable storage, it 01037 * should not be concerned with these transaction threads any further. 01038 * If the coordinator does not have this information recorded, the 01039 * transaction thread could have been an aborted non-read-only transaction, 01040 * so the coordinator must, in this case, presume that the thread aborted 01041 * and thus make the global transaction abort. 01042 */ 01043 static rc_t query_prepared_xct(int numtids, gtid_t l[]); 01044 01045 01046 /**\brief Commit a transaction. 01047 *\ingroup SSMXCT 01048 * @param[in] lazy Optional, controls flushing of log. 01049 * @param[out] plastlsn If non-null, this is a pointer to a 01050 * log sequence number into which the storage 01051 * manager writes the LSN of the last log record 01052 * inserted for this transaction.
01053 * \details 01054 * 01055 * Commit the attached transaction and detach it, destroy it. 01056 * If \a lazy is true, the log is not synced. This means that 01057 * recovery of this transaction might not be possible. 01058 */ 01059 static rc_t commit_xct( 01060 bool lazy = false, 01061 lsn_t* plastlsn=NULL); 01062 01063 /**\brief Commit an instrumented transaction and get its statistics. 01064 *\ingroup SSMXCT 01065 * @param[out] stats Get a copy of the statistics for this transaction. 01066 * @param[in] lazy Optional, controls flushing of log. 01067 * @param[out] plastlsn If non-null, this is a pointer to a 01068 * log sequence number into which the storage 01069 * manager writes the LSN of the last log record 01070 * inserted for this transaction. 01071 * \details 01072 * 01073 * Commit the attached transaction and detach it, destroy it. 01074 * If \a lazy is true, the log is not synced. This means that 01075 * recovery of this transaction might not be possible. 01076 */ 01077 static rc_t commit_xct( 01078 sm_stats_info_t*& stats, 01079 bool lazy = false, 01080 lsn_t* plastlsn=NULL); 01081 01082 /**\brief Commit an instrumented transaction and start a new one. 01083 *\ingroup SSMXCT 01084 * @param[out] stats Get a copy of the statistics for the first transaction. 01085 * @param[in] lazy Optional, controls flushing of log. 01086 * \details 01087 * 01088 * Commit the attached transaction and detach it, destroy it. 01089 * Start a new transaction and attach it to this thread. 01090 * \note \e The \e new 01091 * \e transaction \e inherits \e the \e locks \e of \e the \e old 01092 * \e transaction. 01093 * 01094 * If \a lazy is true, the log is not synced. This means that 01095 * recovery of the committed transaction might not be possible. 01096 */ 01097 static rc_t chain_xct( 01098 sm_stats_info_t*& stats, /* in w/new, out w/old */ 01099 bool lazy = false); 01100 01101 /**\brief Commit a transaction and start a new one, inheriting locks.
01102 *\ingroup SSMXCT 01103 * @param[in] lazy Optional, controls flushing of log. 01104 * \details 01105 * 01106 * Commit the attached transaction and detach it, destroy it. 01107 * Start a new transaction and attach it to this thread. 01108 * \note \e The \e new 01109 * \e transaction \e inherits \e the \e locks \e of \e the \e old 01110 * \e transaction. 01111 * 01112 * If \a lazy is true, the log is not synced. This means that 01113 * recovery of the committed transaction might not be possible. 01114 */ 01115 static rc_t chain_xct(bool lazy = false); 01116 01117 /**\brief Abort an instrumented transaction and get its statistics. 01118 *\ingroup SSMXCT 01119 * @param[out] stats Get a copy of the statistics for this transaction. 01120 * \details 01121 * 01122 * Abort the attached transaction and detach it, destroy it. 01123 */ 01124 static rc_t abort_xct(sm_stats_info_t*& stats); 01125 /**\brief Abort a transaction. 01126 *\ingroup SSMXCT 01127 * \details 01128 * 01129 * Abort the attached transaction and detach it, destroy it. 01130 */ 01131 static rc_t abort_xct(); 01132 01133 /**\brief Populate a save point. 01134 *\ingroup SSMSP 01135 * @param[out] sp An sm_save_point_t owned by the caller. 01136 *\details 01137 * Store in sp the needed information to be able to roll back 01138 * to this point. 01139 * For use with rollback_work. 01140 * \note Only one thread may be attached to a transaction when this 01141 * is called. 01142 */ 01143 static rc_t save_work(sm_save_point_t& sp); 01144 01145 /**\brief Roll back to a savepoint. 01146 *\ingroup SSMSP 01147 * @param[in] sp An sm_save_point_t owned by the caller and 01148 * populated by save_work. 01149 *\details 01150 * Undo everything that was 01151 * done from the time save_work was called on this savepoint. 01152 * \note Locks are not freed. 01153 * 01154 * \note Only one thread may be attached to a transaction when this 01155 * is called. 
01156 */ 01157 static rc_t rollback_work(const sm_save_point_t& sp); 01158 01159 /**\brief Return the number of transactions in active state. 01160 *\ingroup SSMXCT 01161 * \details 01162 * While this is thread-safe, the moment a value is returned, it could 01163 * be out of date. 01164 * Useful only for debugging. 01165 */ 01166 static w_base_t::uint4_t num_active_xcts(); 01167 01168 /**\brief Attach the given transaction to the currently-running smthread_t. 01169 *\ingroup SSMXCT 01170 * \details 01171 * It is assumed that the currently running thread is an smthread_t. 01172 */ 01173 static void attach_xct(xct_t *x) { me()->attach_xct(x); } 01174 01175 /**\addtogroup SSMMULTIXCT 01176 * 01177 * Certain operations may be performed while more than one 01178 * thread is attached to a transaction (this functionality is 01179 * soon to be deprecated). 01180 * Any number of attached threads may be read-only. 01181 * The kinds of updates that can be made by multiple threads are limited by 01182 * the need to avoid latch-mutex and latch-latch deadlocks. 01183 * 01184 * There are several reasons for this. 01185 * 1) The multiple threads are not protected from each other by locks. 01186 * 2) Interleaving of top-level actions is not supported with rollback; 01187 * this means that for the duration of a top-level action, a thread needs 01188 * access to the log that excludes all other threads in 01189 * the same transaction. 01190 * 01191 * The internal logging protocol is this: 01192 * T1: latch page, log update. Logging requires acquiring a mutex 01193 * on the xct's log buffer. 01194 * T2: performing any top-level action, acquires the mutex on the 01195 * xct's log buffer before doing the action (latching the page). 01196 * 01197 * Thus, anything involving top-level actions is suspect. B-trees 01198 * use top-level actions, as does file-page allocation, and creation/ 01199 * destruction of stores (files, indexes). 
Thus, just about 01200 * any kind of concurrent updates on the same page 01201 * in the same transaction is problematic, and just about any update 01202 * can result in latching extent-map or store-map pages. 01203 * This activity could be disallowed by enforcing a strict 01204 * rule that at most one update operation can be going on 01205 * in a transaction at any time; however, this is too restrictive. 01206 * 01207 * Multiple updating threads can 01208 * work \b if \b the \b data \b are \b partitioned by volume. 01209 * So a well-behaved server may use multiple-threaded transactions 01210 * to do updates as long as the updates are on different \b volumes. 01211 * It might also allow read-only transaction threads to be 01212 * concurrent with a single updating thread. 01213 * 01214 * Savepoints and partial rollback may \e not be used with 01215 * multi-threaded transactions. This is not enforced by the storage 01216 * manager; it is poor behavior on the part of a server. 01217 * For example, the behavior of the following is undefined: 01218 * - thread 1: attach, read, read, read, ... 01219 * - thread 2: attach, save work, update, rollback 01220 * If the two threads are reading and possibly updating the same 01221 * data, the results are timing-dependent and could produce a latch- 01222 * latch or latch-mutex deadlock. 01223 * 01224 * Ongoing research at DIAS is investigating ways to extend the usefulness 01225 * of parallelism within a transaction (multi-threaded transactions). 01226 * Current thoughts about this are for servers to coordinate multiple 01227 * transactions using two-phase commit or an optimized version 01228 * of commit and abort for groups of local transactions. 01229 */ 01230 01231 /**\brief Detach any attached transaction from the currently-running smthread_t. 01232 *\ingroup SSMXCT 01233 * \details 01234 * Sever the connection between the running thread and the transaction.
01235 * This allows the running thread to attach a different 01236 * transaction and to perform work on its behalf. 01237 */ 01238 static void detach_xct() { xct_t *x = me()->xct(); 01239 if(x) me()->detach_xct(x); } 01240 01241 /**\brief Get the transaction structure for a given transaction id. 01242 *\ingroup SSMXCT 01243 * @param[in] tid Transaction ID. 01244 *\details 01245 * Return a pointer to the storage manager's transaction structure. 01246 * Can be used with detach_xct and attach_xct. 01247 */ 01248 static xct_t* tid_to_xct(const tid_t& tid); 01249 /**\brief Get the transaction ID for a given transaction structure. 01250 *\ingroup SSMXCT 01251 * @param[in] x Pointer to transaction structure. 01252 *\details 01253 * Return the transaction ID for the given transaction. 01254 */ 01255 static tid_t xct_to_tid(const xct_t* x); 01256 01257 /**\brief Print transaction information to an output stream. 01258 *\ingroup SSMAPIDEBUG 01259 * @param[in] o Stream to which to write the information. 01260 * \details 01261 * This is for debugging only, and is not thread-safe. 01262 */ 01263 static rc_t dump_xcts(ostream &o); 01264 01265 /**\brief Get the transaction state for a given transaction (structure). 01266 *\ingroup SSMXCT 01267 * @param[in] x Pointer to transaction structure. 01268 * \details 01269 * Returns the state of the transaction (active, prepared). It is 01270 * hard to get the state of an aborted or committed transaction, since 01271 * their structures no longer exist. 01272 */ 01273 static xct_state_t state_xct(const xct_t* x); 01274 01275 /**\brief Return the amount of log this transaction would consume 01276 * if it rolled back. 01277 *\ingroup SSMXCT 01278 * 01279 * If a transaction aborts with eOUTOFLOGSPACE this function can 01280 * be used in conjunction with xct_reserve_log_space to 01281 * pre-allocate the needed amount of log space before retrying.
01282 */ 01283 static smlevel_0::fileoff_t xct_log_space_needed(); 01284 01285 /**\brief Require the specified amount of log space to be 01286 * available for this transaction before continuing. 01287 *\ingroup SSMXCT 01288 * 01289 * If a transaction risks running out of log space it can 01290 * pre-request some or all of the needed amount before starting in 01291 * order to improve its chances of success. Other new transactions 01292 * will be unable to acquire log space before this request is 01293 * granted (existing ones will be able to commit, unless they also 01294 * run out of space, because that tends to free up log space and 01295 * avoids wasting work). 01296 */ 01297 static rc_t xct_reserve_log_space(fileoff_t amt); 01298 01299 /**\brief Get the locking granularity for the attached transaction. 01300 * \ingroup SSMLOCK 01301 */ 01302 static concurrency_t xct_lock_level(); 01303 /**\brief Set the default locking level for the attached transaction. 01304 * \ingroup SSMLOCK 01305 * \details 01306 * @param[in] l The level to use for the balance of this transaction. 01307 * Legitimate values are t_cc_record, t_cc_page, t_cc_file. 01308 * 01309 * \note Only one thread may be attached to the transaction when this 01310 * is called. If more than one thread is attached, a fatal error 01311 * will ensue. 01312 */ 01313 static void set_xct_lock_level(concurrency_t l); 01314 01315 /**\brief Collect transaction information in a virtual table. 01316 * \ingroup SSMVTABLE 01317 * \details 01318 * @param[out] v The virtual table to populate. 01319 * @param[in] names_too If true, make the 01320 * first row of the table a list of the attribute names. 01321 * 01322 * All attribute values will be strings. 01323 * The virtual table v can be printed with its output operator 01324 * operator<< for ostreams. 01325 * 01326 * \attention Not atomic. Can yield stale data. 
01327 */ 01328 static rc_t xct_collect(vtable_t&v, bool names_too=true); 01329 01330 /**\brief Collect buffer pool information in a virtual table. 01331 * \ingroup SSMVTABLE 01332 * \details 01333 * @param[out] v The virtual table to populate. 01334 * @param[in] names_too If true, make the 01335 * first row of the table a list of the attribute names. 01336 * 01337 * \attention Be wary of using this with a large buffer pool. 01338 * 01339 * All attribute values will be strings. 01340 * The virtual table v can be printed with its output operator 01341 * operator<< for ostreams. 01342 * 01343 * \attention Not atomic. Can yield stale data. 01344 */ 01345 static rc_t bp_collect(vtable_t&v, bool names_too=true); 01346 01347 /**\brief Collect lock table information in a virtual table. 01348 * \ingroup SSMVTABLE 01349 * \details 01350 * @param[out] v The virtual table to populate. 01351 * @param[in] names_too If true, make the 01352 * first row of the table a list of the attribute names. 01353 * 01354 * All attribute values will be strings. 01355 * The virtual table v can be printed with its output operator 01356 * operator<< for ostreams. 01357 * 01358 * \attention Not atomic. Can yield stale data. 01359 * Cannot be used in a multi-threaded-transaction context. 01360 */ 01361 static rc_t lock_collect(vtable_t&v, bool names_too=true); 01362 01363 /**\brief Collect thread information in a virtual table. 01364 * \ingroup SSMVTABLE 01365 * \details 01366 * @param[out] v The virtual table to populate. 01367 * @param[in] names_too If true, make the 01368 * first row of the table a list of the attribute names. 01369 * 01370 * All attribute values will be strings. 01371 * The virtual table v can be printed with its output operator 01372 * operator<< for ostreams. 01373 * 01374 * \attention Not thread-safe. Can yield stale data. 01375 */ 01376 static rc_t thread_collect(vtable_t&v, bool names_too=true); 01377 01378 /**\brief Take a checkpoint. 
01379 * \ingroup SSMAPIDEBUG 01380 * \note For debugging only! 01381 * 01382 * Force the storage manager to take a checkpoint. 01383 * Checkpoints are fuzzy : they can be taken while most other 01384 * storage manager activity is happening, even though they have 01385 * to be serialized with respect to each other, and with respect to 01386 * a few other activities. 01387 * 01388 * This is thread-safe. 01389 */ 01390 static rc_t checkpoint(); 01391 01392 /**\brief Force the buffer pool to flush its pages to disk. 01393 * \ingroup SSMAPIDEBUG 01394 * @param[in] invalidate True means discard pages after flush. 01395 * \note For debugging only! 01396 * \attention Do not call force_buffers with anything pinned. 01397 * You may cause latch-latch deadlocks, as this method has 01398 * to scan the entire buffer pool and possibly EX-latch pages to prevent 01399 * others from updating while it forces to disk. 01400 * Since the page-order is essentially random, we cannot 01401 * preclude latch-latch deadlocks with other threads. 01402 */ 01403 static rc_t force_buffers(bool invalidate = false); 01404 01405 /**\brief Force the buffer pool to flush the volume header page(s) 01406 * to disk. 01407 * \ingroup SSMAPIDEBUG 01408 * @param[in] vid ID of the volume of interest 01409 * \note For debugging only! 01410 * \attention Do not call force_vol_hdr_buffers with anything pinned. 01411 * You could cause latch-latch deadlocks, as this method has 01412 * to scan the entire buffer pool and possibly EX-latch some pages. 01413 * Since the page-order is essentially random, we cannot 01414 * preclude latch-latch deadlocks with other threads. 01415 */ 01416 static rc_t force_vol_hdr_buffers( const vid_t& vid); 01417 01418 /**\brief Force the buffer pool to flush to disk all pages 01419 * for the given store. 01420 * \ingroup SSMAPIDEBUG 01421 * @param[in] stid Store whose pages are to be flushed. 01422 * @param[in] invalidate True means discard the pages after flushing. 
01423 * \note For debugging only! 01424 * \attention Do not call force_store_buffers with anything pinned. 01425 * You may cause latch-latch deadlocks, as this method has 01426 * to scan the entire buffer pool and, if invalidate==true, 01427 * EX-latch pages to prevent others from updating 01428 * while it forces to disk. 01429 * Since the page-order is essentially random, we cannot 01430 * preclude latch-latch deadlocks with other threads. 01431 */ 01432 static rc_t force_store_buffers(const stid_t & stid, 01433 bool invalidate); 01434 01435 /**\cond skip 01436 * Do not document. Very un-thread-safe. 01437 */ 01438 static rc_t dump_buffers(ostream &o); 01439 static rc_t dump_locks(ostream &o); 01440 static rc_t dump_locks(); // defaults to std::cout 01441 static rc_t dump_exts(ostream &o, 01442 vid_t v, 01443 extnum_t start, 01444 extnum_t end); 01445 01446 static rc_t dump_stores(ostream &o, 01447 vid_t v, 01448 int start, 01449 int end); 01450 01451 static rc_t dump_histo(ostream &o, bool locked); 01452 01453 static rc_t snapshot_buffers( 01454 u_int& ndirty, 01455 u_int& nclean, 01456 u_int& nfree, 01457 u_int& nfixed); 01458 /**\endcond skip */ 01459 01460 /**\brief Get a copy of the statistics from an attached instrumented transaction. 01461 * \ingroup SSMXCT 01462 * \details 01463 * @param[out] stats Returns a copy of the statistics for this transaction. 01464 * @param[in] reset If true, the statistics for this transaction will be zeroed. 01465 */ 01466 static rc_t gather_xct_stats( 01467 sm_stats_info_t& stats, 01468 bool reset = false); 01469 01470 /**\brief Get a copy of the global statistics. 01471 * \ingroup SSMSTATS 01472 * \details 01473 * @param[out] stats A pre-allocated structure. 01474 */ 01475 static rc_t gather_stats( 01476 sm_stats_info_t& stats 01477 ); 01478 01479 /**\brief Get a copy of configuration-dependent information. 01480 * \ingroup OPT 01481 * \details 01482 * @param[out] info A pre-allocated structure.
01483 */ 01484 static rc_t config_info(sm_config_info_t& info); 01485 01486 /**\brief Set sleep time before I/O operations. 01487 * \ingroup SSMVOL 01488 * \details 01489 * This method sets a milli_sec delay to occur before 01490 * each disk read/write operation. This is for debugging. 01491 * It is useful in discovering thread sync bugs. 01492 * This delay applies to all threads. 01493 */ 01494 static rc_t set_disk_delay(u_int milli_sec); 01495 01496 /**\cond skip */ 01497 // TODO : document crash testing facilities 01498 /**\brief Simulate a crash 01499 * \details 01500 * This method tells the log manager to start generating corrupted 01501 * log records. This will make it appear that a crash occurred 01502 * at that point in the log. A call to this method should be 01503 * followed immediately by a dirty shutdown of the ssm. 01504 */ 01505 static rc_t start_log_corruption(); 01506 /**\endcond skip */ 01507 01508 // Forces a log flush 01509 static rc_t sync_log(bool block=true); 01510 static rc_t flush_until(lsn_t& anlsn, bool block=true); 01511 01512 // Allowing to access info about the important lsns (curr and durable) 01513 static rc_t get_curr_lsn(lsn_t& anlsn); 01514 static rc_t get_durable_lsn(lsn_t& anlsn); 01515 01516 01517 /* 01518 Device and Volume Management 01519 ---------------------------- 01520 A device is either an operating system file or operating system 01521 device and is identified by a path name (absolute or relative). 01522 A device has a quota. In theory, a device may have 01523 multiple volumes on it but 01524 in the current implementation the maximum number of volumes 01525 is 1. 01526 01527 A volume is where data is stored. A volume is identified 01528 uniquely and persistently by a long volume ID (lvid_t). 01529 Volumes can be used whenever the device they are located 01530 on is mounted by the SM. Volumes have a quota. The 01531 sum of the quotas of all the volumes on a device cannot 01532 exceed the device quota. 
01533 01534 The basic steps to begin using a new device/volume are: 01535 format_dev: initialize the device 01536 mount_dev: allow use of the device and all its volumes 01537 generate_new_lvid: generate a unique ID for the volume 01538 create_vol: create a volume on the device 01539 */ 01540 01541 /* 01542 * Device management functions 01543 */ 01544 /**\addtogroup SSMVOL 01545 * The storage manager was designed to permit multiple \e volumes 01546 * on a \e device, with \e volume analogous to a Unix \e partition and 01547 * a \e device analogous to a disk, and the original SHORE contained 01548 * symmetric peer servers. 01549 * However good that intention, multiple volumes on a device were never 01550 * implemented, and times have changed, and the storage manager no 01551 * longer has any notion of remote and local volumes. 01552 * The notion of a volume, separate from a device, remains, but may 01553 * some day disappear. 01554 * 01555 * For the time being, a device contains at most one volume. 01556 * 01557 * A device is either an operating system file or 01558 * an operating system device (e.g., raw disk partition) and 01559 * is identified by a path name (absolute or relative). 01560 * 01561 * A device has a quota. 01562 * A device is intended to have multiple volumes on it, but 01563 * in the current implementation the maximum number of volumes 01564 * is exactly 1. 01565 * 01566 * A volume is where data are stored. 01567 * Each volume is a header and a set of pages. All pages are 01568 * the same size (this is a compile-time constant, the default being 01569 * 8K and sizes up to 64K permissible). 01570 * 01571 * A volume is identified uniquely and persistently by a 01572 * long volume ID (lvid_t), which is stored in its header. 01573 * Volumes can be used whenever the device they are located 01574 * on is mounted by the SM. 01575 * Volumes have a quota. The 01576 * sum of the quotas of all the volumes on a device cannot 01577 * exceed the device quota.
01578 * 01579 * A volume contains a variety of data structures. All user 01580 * data reside in \e stores. A store is a collection of the 01581 * pages on the volume, allocated in \e extents of a size that 01582 * is a compile-time constant. (The storage manager has only 01583 * been tested with an extent-size of 8 pages. The compile-time constant 01584 * can be changed, but it also requires changes elsewhere in the code 01585 * to maintain alignment of persistent structures. 01586 * See the comments in config/shore.def.) Thus, the minimum size 01587 * of a store is one extent's worth of pages. 01588 * Larger extents provide better clustering, but more wasted space if 01589 * small files and small indexes will be common. 01590 * 01591 * Stores are identified by a store number (snum_t). 01592 * 01593 * Each volume contains a few stores that are "overhead": 01594 * 0 -- is reserved for an extent map and a store map 01595 * 1 -- directory (dir_m) 01596 * 2 -- root index 01597 * 01598 * Beyond that, for each (user) file created, 2 stores are used, one for 01599 * small objects, one for large objects, and for each index (btree, rtree) 01600 * created 1 store is used. 01601 * 01602 * Each volume is laid out thus: 01603 * - volume header, which identifies the number of extents on 01604 * the volume, determined when the volume is formatted. 01605 * This is always in page 1 of the volume. 01606 * - store map: some number of pages describing the stores on the volume, 01607 * namely, being the heads of linked-lists of extents that make up 01608 * the stores. The number of such pages is determined when the 01609 * volume is formatted. The worst case is assumed, which is that one 01610 * might fill the volume with one-extent stores. 01611 * - extent map: some number of pages of bitmaps, one bitmap for each 01612 * extent, describing which pages in the extents are allocated or free. 01613 * - data pages: the rest of the volume. 01614 * 01615 */ 01616 01617 /**\brief Format a device. 
01618 * \ingroup SSMVOL 01619 * \details 01620 * @param[in] device Operating-system file name of the "device". 01621 * @param[in] quota_in_KB Quota in kilobytes. 01622 * @param[in] force If true, format the device even if it already exists. 01623 * 01624 * Since raw devices always "exist", \a force should be given as true 01625 * for raw devices. 01626 * 01627 * A device may not be formatted if it is already mounted. 01628 * 01629 * \note This method should \b not 01630 * be called in the context of a transaction. 01631 */ 01632 static rc_t format_dev( 01633 const char* device, 01634 smksize_t quota_in_KB, 01635 bool force); 01636 01637 /**\brief Mount a device. 01638 * \ingroup SSMVOL 01639 * \details 01640 * @param[in] device Operating-system file name of the "device". 01641 * @param[out] vol_cnt Number of volumes on the device. 01642 * @param[out] devid A local device id assigned by the storage manager. 01643 * @param[in] local_vid A local handle to the (only) volume on the device, 01644 * to be used when a volume is mounted. The default, vid_t::null, 01645 * indicates that the storage manager can choose a value for this. 01646 * 01647 * \note It is fine to mount a device more than once, as long as device 01648 * is always the same (you cannot specify a hard link or soft link to 01649 * an entity mounted under a different path). 01650 * Device mounts are \b not reference-counted, so a single dismount_dev 01651 * renders the volumes on the device unusable. 01652 * 01653 * \note This method should \b not 01654 * be called in the context of a transaction. 01655 */ 01656 static rc_t mount_dev( 01657 const char* device, 01658 u_int& vol_cnt, 01659 devid_t& devid, 01660 vid_t local_vid = vid_t::null); 01661 01662 /**\brief Dismount a device. 01663 * \ingroup SSMVOL 01664 * \details 01665 * @param[in] device Operating-system file name of the "device". 
01666 * 01667 * \note It is fine to mount a device more than once, as long as device 01668 * is always the same (you cannot specify a hard link or soft link to 01669 * an entity mounted under a different path). 01670 * Device mounts are \b not reference-counted, so a single dismount_dev 01671 * renders the volumes on the device unusable. 01672 * 01673 * \note This method should \b not 01674 * be called in the context of a transaction. 01675 */ 01676 01677 static rc_t dismount_dev(const char* device); 01678 01679 /**\brief Dismount all mounted devices. 01680 * \ingroup SSMVOL 01681 * 01682 * \note This method should \b not 01683 * be called in the context of a transaction. 01684 */ 01685 static rc_t dismount_all(); 01686 01687 // list_devices returns an array of char* pointers to the names of 01688 // all mounted devices. Note that the use of a char*'s is 01689 // a temporary hack until a standard string class is available. 01690 // the char* pointers are pointing directly into the device 01691 // mount table. 01692 // dev_cnt is the length of the list returned. 01693 // dev_list and devid_list must be deleted with delete [] by the 01694 // caller if they are not null (0). They should be null 01695 // if an error is returned or if there are no devices. 01696 /**\brief Return a list of all mounted devices. 01697 * \ingroup SSMVOL 01698 * \details 01699 * @param[out] dev_list Returned list of pointers directly into the mount table. 01700 * @param[out] devid_list Returned list of associated device ids. 01701 * @param[out] dev_cnt Returned number of entries in the two above lists. 01702 * 01703 * The storage manager allocates the arrays returned with new[], and the 01704 * caller must return these to the heap with delete[] if they are not null. 01705 * They will be null if an error is returned or if no devices are mounted. 01706 * 01707 * The strings to which dev_list[*] point are \b not to be deleted by 01708 * the caller. 
01709 */ 01710 static rc_t list_devices( 01711 const char**& dev_list, 01712 devid_t*& devid_list, 01713 u_int& dev_cnt); 01714 01715 /**\brief Return a list of all volumes on a device. 01716 * \ingroup SSMVOL 01717 * \details 01718 * @param[in] device Operating-system file name of the "device". 01719 * @param[out] lvid_list Returned list of the long volume ids (lvid_t) of the volumes on the device. 01720 * @param[out] lvid_cnt Returned length of list lvid_list. 01721 * 01722 * The storage manager allocates the array lvid_list 01723 * with new[], and the 01724 * caller must return it to the heap with delete[] if it is not null. 01725 * It will be null if an error is returned. 01726 * 01727 * \note This method should \b not 01728 * be called in the context of a transaction. 01729 */ 01730 static rc_t list_volumes( 01731 const char* device, 01732 lvid_t*& lvid_list, 01733 u_int& lvid_cnt 01734 ); 01735 01736 // get_device_quota returns the "quota" (in KB) of the device 01737 // and the amount of the quota allocated to volumes on the device. 01738 /**\brief Get the device quota. 01739 * \ingroup SSMVOL 01740 * \details 01741 * @param[in] device Operating-system file name of the "device". 01742 * @param[out] quota_KB Returned quota in kilobytes 01743 * @param[out] quota_used_KB Returned portion of quota allocated to volumes 01744 * 01745 * The quota_used_KB is the portion of the quota allocated to volumes on the device. 01746 * 01747 * \note This method \b may 01748 * be called in the context of a transaction. 01749 * 01750 * \note This method \b may 01751 * be called in the context of a transaction. 
01752 */ 01753 static rc_t get_device_quota( 01754 const char* device, 01755 smksize_t& quota_KB, 01756 smksize_t& quota_used_KB); 01757 01758 01759 /* 01760 * Volume management functions 01761 */ 01762 01763 /**\brief Change the fake disk latency before I/Os on this volume, 01764 * for debugging purposes 01765 * \ingroup SSMVOL 01766 * \details 01767 * @param[in] vid The ID of the volume of interest. 01768 * @param[in] adelay Nanoseconds to sleep with ::nanosleep() 01769 * 01770 * This is for debugging only. 01771 * Changing the value of the latency for a volume does not enable the 01772 * delay. 01773 */ 01774 static rc_t set_fake_disk_latency(vid_t vid, const int adelay); 01775 01776 /**\brief Enable the fake disk latency before I/Os on this volume, for debugging purposes 01777 * \ingroup SSMVOL 01778 * \details 01779 * @param[in] vid The ID of the volume of interest. 01780 * 01781 * This is for debugging only. 01782 * When this is enabled, it uses whatever disk latency was set with 01783 * ss_m::create_vol() or the last applied ss_m::set_fake_disk_latency(). 01784 */ 01785 static rc_t enable_fake_disk_latency(vid_t vid); 01786 /**\brief Disable the fake disk latency before I/Os on this volume, for debugging purposes 01787 * \ingroup SSMVOL 01788 * \details 01789 * @param[in] vid The ID of the volume of interest. 01790 * 01791 * This is for debugging only. 01792 */ 01793 static rc_t disable_fake_disk_latency(vid_t vid); 01794 01795 01796 /**\brief Generate a new long volume id. 01797 * \ingroup SSMVOL 01798 * \details 01799 * @param[out] lvid Generated long volume id, to be used on ss_m::create_vol(). 01800 * 01801 * This generates a unique volume identifier to be written persistently 01802 * on the volume when it is formatted. 01803 * This enables us to avoid the mistake of doubly-mounting a volume. 01804 * The identifier is constructed from the machine network address and the 01805 * time of day. 
01806 */ 01807 static rc_t generate_new_lvid(lvid_t& lvid); 01808 01809 /**\brief Add a volume to a device. 01810 * \ingroup SSMVOL 01811 * \details 01812 * @param[in] device_name Operating-system file name of the "device". 01813 * @param[in] lvid Long volume id to use when formatting the new volume. 01814 * @param[in] quota_KB Quota in kilobytes. 01815 * @param[in] skip_raw_init Do not initialize the volume if on a raw device. 01816 * @param[in] local_vid Short volume id by which to refer to this volume. 01817 * If null, the storage manager will assign one. 01818 * @param[in] apply_fake_io_latency See ss_m::enable_fake_disk_latency() 01819 * @param[in] fake_disk_latency See ss_m::set_fake_disk_latency() 01820 * 01821 * \note This method should \b not 01822 * be called in the context of a transaction. 01823 * 01824 * The pages on the volume \b must be zeroed; you can only use 01825 * \a skip_raw_init = true if you have by some other means 01826 * already initialized the volume. 01827 */ 01828 static rc_t create_vol( 01829 const char* device_name, 01830 const lvid_t& lvid, 01831 smksize_t quota_KB, 01832 bool skip_raw_init = false, 01833 vid_t local_vid = vid_t::null, 01834 const bool apply_fake_io_latency = false, 01835 const int fake_disk_latency = 0); 01836 01837 /**\brief Destroy a volume. 01838 * \ingroup SSMVOL 01839 * \details 01840 * @param[in] lvid Long volume id by which the volume is known. 01841 * 01842 * \note This method should \b not 01843 * be called in the context of a transaction. 01844 */ 01845 static rc_t destroy_vol(const lvid_t& lvid); 01846 01847 /**\brief Gets the quotas associated with the volume. 01848 * \ingroup SSMVOL 01849 * @param[in] lvid Long volume id by which the volume is known. 01850 * @param[out] quota_KB Quota given when the volume was created. 01851 * @param[out] quota_used_KB Portion of the quota that has been used by 01852 * allocated extents. 
01853 */ 01854 static rc_t get_volume_quota( 01855 const lvid_t& lvid, 01856 smksize_t& quota_KB, 01857 smksize_t& quota_used_KB); 01858 01859 /**\cond skip */ 01860 // check_volume_page_types: strictly for debugging/testing 01861 static rc_t check_volume_page_types(vid_t vid); 01862 /**\endcond skip */ 01863 01864 01865 /**\brief Analyze a volume and report statistics regarding disk usage. 01866 * \ingroup SSMVOL 01867 * @param[in] vid The volume of interest. 01868 * @param[out] du The structure that will hold the collected statistics. 01869 * @param[in] audit If "true", the method acquires a share lock on the 01870 * volume and then will check assertions about the 01871 * correctness of the data structures on the volume. 01872 * If the audit fails an internal fatal error is generated 01873 * to facilitate debugging. (It will generate a core file if your 01874 * shell permits such.) 01875 * If "false" an IS lock is acquired, which means that the 01876 * statistics will be fuzzy. 01877 * 01878 * Using the audit feature is useful for debugging. 01879 * It is the only safe way to use this method. 01880 * \note The statistics are added to the sm_du_stats_t structure passed in. 01881 * This structure is not cleared by the storage manager. 01882 */ 01883 static rc_t get_du_statistics( 01884 vid_t vid, 01885 sm_du_stats_t& du, 01886 bool audit = true); 01887 01888 /**\brief Analyze a store and report statistics regarding disk usage. 01889 * \ingroup SSMVOL 01890 * @param[in] stid The store of interest. 01891 * @param[out] du The structure that will hold the collected statistics. 01892 * @param[in] audit If "true", the method acquires a share lock on the 01893 * store and then will check assertions about the 01894 * correctness of the data structures on the store. 01895 * 01896 * Using the audit feature is useful for debugging. 01897 * It is the only safe way to use this method. 
01898 * 01899 */ 01900 static rc_t get_du_statistics( 01901 const stid_t& stid, 01902 sm_du_stats_t& du, 01903 bool audit = true); 01904 01905 01906 /**\brief Analyze a volume and collect brief statistics about its usage. 01907 * \ingroup SSMVOL 01908 * @param[in] vid The volume of interest. 01909 * @param[out] volume_stats The statistics are written here. 01910 * @param[in] cc Indicates whether the volume is to be locked 01911 * by this method. Acceptable values are t_cc_none and t_cc_volume. 01912 * 01913 * If no lock is acquired, the method can fail with eRETRY. 01914 * 01915 */ 01916 static rc_t get_volume_meta_stats( 01917 vid_t vid, 01918 SmVolumeMetaStats& volume_stats, 01919 concurrency_t cc = t_cc_none 01920 ); 01921 01922 /**\brief Analyze a volume and collect brief statistics about its usage. 01923 * \ingroup SSMVOL 01924 * @param[in] vid The volume of interest. 01925 * @param[in] num_files The size of the array file_stats. 01926 * @param[out] file_stats Preallocated array of structs into which to 01927 * write the statistics for the individual files inspected. 01928 * @param[in] batch_calculate True means make one pass over the volume. 01929 * @param[in] cc Indicates whether the volume is to be locked 01930 * by this method. Acceptable values are t_cc_none and t_cc_volume. 01931 * 01932 * If no lock is acquired and batch_calculate is not set, 01933 * the method can fail with eRETRY. 01934 * 01935 * 01936 * If batch_calculate is true then this works by making one pass 01937 * over the meta data, but it looks at all the meta data. This 01938 * should be the faster way to do the analysis when there are 01939 * many files, and when files use a large portion of the volume. 01940 * 01941 * If batch_calculate is false then each file is updated 01942 * individually, only looking at the extent information for that 01943 * particular file. This requires a pass over the volume for each 01944 * file. (Seek-wise it is less efficient). 
01945 * 01946 */ 01947 static rc_t get_file_meta_stats( 01948 vid_t vid, 01949 w_base_t::uint4_t num_files, 01950 SmFileMetaStats* file_stats, 01951 bool batch_calculate = false, 01952 concurrency_t cc = t_cc_none 01953 ); 01954 01955 /**\brief Get the index ID of the root index of the volume. 01956 * \ingroup SSMVOL 01957 * 01958 * @param[in] v Volume of interest. 01959 * @param[out] iid Store ID of the root index. 01960 * \details 01961 * 01962 * Each volume has a root index, which is a well-known 01963 * index available to the server for bootstrapping a database. 01964 * 01965 */ 01966 static rc_t vol_root_index( 01967 const vid_t& v, 01968 stid_t& iid 01969 ) { iid.vol = v; iid.store = store_id_root_index; return RCOK; } 01970 01971 /***************************************************************** 01972 * storage operations: smfile.cpp 01973 *****************************************************************/ 01974 /**\addtogroup SSMSTORE 01975 * Indexes and files are special cases of "stores". 01976 * A store is a linked list of extents, and an extent is a 01977 * contiguous group of pages. So the store is the structure 01978 * that holds together an ordered set of pages that can be 01979 * used by a server and have an identifier (a store ID or stid_t). 01980 * 01981 * Indexes and files of records are built on stores. 01982 * 01983 * Stores have logging properties and 01984 * other metadata associated with them. 01985 * 01986 * The property that determines the logging level of the store is 01987 * \ref sm_store_property_t. 01988 * 01989 * Methods that let you get and change the metadata are: 01990 * - ss_m::get_store_property 01991 * - ss_m::set_store_property 01992 * - ss_m::get_store_info 01993 * - \ref snum_t 01994 * 01995 * When a transaction deletes a file or index, the deletion of the 01996 * underlying stores is delayed until the transaction commits so that 01997 * the pages allocated to the stores remain reserved (lest the 01998 * transaction aborts). 
The deleting transaction could, in theory, 01999 * reuse the pages for another store, but in practice that is not done. 02000 * Instead, when a store is deleted, the store is marked 02001 * for deletion and put in a list for the transaction to delete upon 02002 * commit. At commit time, stores that have property t_load_file 02003 * or t_insert_file are converted to t_regular. 02004 */ 02005 02006 /**\brief Change the store property of a file or index. 02007 * \ingroup SSMSTORE 02008 * @param[in] stid File ID or index ID of the store to change. 02009 * @param[in] property Enumeration store_property_t (alias for 02010 * smlevel_3::sm_store_property_t, q.v.) 02011 * 02012 * \details 02013 * The possible uses of store properties are described with 02014 * smlevel_3::sm_store_property_t. 02015 */ 02016 static rc_t set_store_property( 02017 stid_t stid, 02018 store_property_t property 02019 ); 02020 02021 /**\brief Get the store property of a file or index. 02022 * \ingroup SSMSTORE 02023 * @param[in] stid File ID or index ID of the store of interest. 02024 * @param[out] property Reference to enumeration store_property_t 02025 * (alias for smlevel_3::sm_store_property_t, q.v.) 02026 * 02027 * \details 02028 * The possible uses of store properties are described with 02029 * smlevel_3::sm_store_property_t. 02030 */ 02031 static rc_t get_store_property( 02032 stid_t stid, 02033 store_property_t& property); 02034 02035 /**\brief Get various store information of a file or index. 02036 * \ingroup SSMSTORE 02037 * @param[in] stid File ID or index ID of the store of interest. 02038 * @param[out] info Reference to sm_store_info_t into which to 02039 * write the results. 02040 * 02041 * \details 02042 * Get internally stored information about a store. 
02043 */ 02044 static rc_t get_store_info( 02045 const stid_t& stid, 02046 sm_store_info_t& info); 02047 02048 // 02049 // Functions for B+tree Indexes 02050 // 02051 /**\addtogroup SSMBTREE 02052 * The storage manager supports B+-Tree indexes, which provide associative access 02053 * to data by associating keys with values in 1:1 or many:1 relationships. 02054 * Keys may be composed of any of the basic C-language types (integer, 02055 * unsigned, floating-point of several sizes) or 02056 * variable-length character strings (wide characters are \b not supported). 02057 * 02058 * The number of key-value pairs that an index can hold is limited by the 02059 * space available on the volume containing the index. 02060 * \anchor max_entry_size 02061 * The combined sizes of the key and value must 02062 * be less than or equal to \ref max_entry_size, which is 02063 * a function of the page size, and is 02064 * such that two entries of this size fit on a page along with all 02065 * the page and entry metadata. See sm_config_info_t and ss_m::config_info. 02066 * 02067 * The minimum size of a B-Tree index is 8 pages (1 extent). 02068 * 02069 * A variety of locking protocols is supported: 02070 * - none : acquire no locks on the {key,value} pairs in the index, 02071 * although an intention lock might be acquired on the index. 02072 * - kvl : key-value locking See \ref MOH1. The key or 02073 * key-value pair is hashed into a 4-byte value and used with the 02074 * given store id to make a lock id. 02075 * - im : index-management locking See \ref MOH1. 02076 * The "value" portion of 02077 * the key-value lock is taken to be a record id, which is used 02078 * for the lock id. 02079 * - modified kvl : an ad-hoc protocol used by the Paradise project. See \ref MODKVL "the scan_index_i constructor". As with index-management locking, 02080 * the "value" portion of 02081 * the key-value lock is taken to be a record id, which is used 02082 * for the lock id. 02083 * - file : full-index locking. 
02084 * 02085 * \section key_description Key Types 02086 * A B+-Tree index key has a type determined when the index is created. 02087 * All keys are stored in lexicographic format based on an interpretation of 02088 * the key determined by the key description given when the index is 02089 * created. 02090 * Lookups on the B+-Tree then involve a single byte-by-byte 02091 * comparison of two byte-strings, each composed of its concatenated 02092 * sub-keys. 02093 * 02094 * The key description is a null-terminated string as follows: 02095 \verbatim 02096 <key_description> ::= <fixed_len_part>* <variable_len_part> | 02097 <fixed_len_part>+ 02098 <fixed_len_part> ::= <type> <len> 02099 <variable_len_part> ::= <type> '*' <len> 02100 <type> ::= 'i' | 'u' | 'f' | 'b' | 'I' | 'U' | 'F' | 'B' 02101 <len> ::= [1-9][0-9]* 02102 \endverbatim 02103 * Thus, a key may have any number of fixed-length parts followed by at 02104 * most one variable-length part. 02105 * 02106 * The fixed-length parts (if present) consist of a type and a length. 02107 * 02108 * The variable-length part (if present) consists of a type and a length 02109 * separated by an asterisk, which is what distinguishes a variable-length 02110 * from a fixed-length part. 02111 * 02112 * Types and permissible lengths are: 02113 * - integer (1,2,4,8) 02114 * - unsigned (1,2,4,8) 02115 * - floating (4,8) 02116 * - uninterpreted byte (any length greater than zero) 02117 * 02118 * A capital letter indicates that the key part may be compressed. Only prefix 02119 * compression is implemented, so it makes sense to compress if the 02120 * first part of the key is compressible. 02121 * 02122 * Examples: 02123 * - "B40u4u2u2" : 40-byte character string followed by a 4-byte unsigned integer, 02124 * a 2-byte unsigned integer and a 2-byte unsigned integer, such as one might 02125 * use for name.year.mo.day. The character string is 02126 * prefix-compressed. 
02127 * - "f8" : an 8-byte floating-point number (double) 02128 * - "I8B*1000" : An 8-byte integer followed by an uninterpreted string 02129 * of up to 1000 bytes, all prefix-compressed. 02130 * 02131 * \note Wide characters are not supported. 02132 * 02133 * This key descriptor is stored in the sm_store_info_t, which is 02134 * stored on the volume and is available with the method ss_m::get_store_info. 02135 * Keys are stored in \ref LEXICOFORMAT "lexicographic format". The 02136 * storage manager knows how to convert all the key types listed above. 02137 * When duplicates are permitted, the index assumes that the elements 02138 * are in lexicographic order when searching for a <key,element> pair. 02139 * 02140 * \section XXXX1 Bulk Loading 02141 * Bulk-loading of all index types is supported. See \ref SSMBULKLD. 02142 */ 02143 02144 02145 /**\brief Create a B+-Tree index. 02146 * \ingroup SSMBTREE 02147 * @param[in] vid Volume on which to create the index. 02148 * @param[in] ntype Type of index. Legitimate values are: 02149 * - t_btree : B+-Tree with duplicate keys allowed 02150 * - t_uni_btree : B+-Tree without duplicate keys 02151 * @param[in] property Logging level of store. Legitimate values are: 02152 * - t_regular 02153 * - t_load_file 02154 * - t_insert_file 02155 * See sm_store_property_t for details. 02156 * @param[in] key_desc Description of key type. 02157 * See \ref key_description for details. 02158 * @param[in] cc The locking protocol to use with this index. See 02159 * smlevel_0::concurrency_t and \ref SSMBTREE. 02160 * @param[out] stid New store ID will be returned here. 02161 */ 02162 static rc_t create_index( 02163 vid_t vid, 02164 ndx_t ntype, 02165 store_property_t property, 02166 const char* key_desc, 02167 concurrency_t cc, 02168 stid_t& stid 02169 ); 02170 02171 /**\brief Create a B+-Tree or R*-Tree index. 02172 * \ingroup SSMBTREE 02173 *\attention For backward compatibility. Will be deprecated later. 
02174 */ 02175 static rc_t create_index( 02176 vid_t vid, 02177 ndx_t ntype, 02178 store_property_t property, 02179 const char* key_desc, 02180 stid_t& stid 02181 ); 02182 02183 /**\brief Destroy a B+-Tree index. 02184 * \ingroup SSMBTREE 02185 * 02186 * @param[in] iid ID of the index to be destroyed. 02187 */ 02188 static rc_t destroy_index(const stid_t& iid); 02189 02190 /**\brief Bulk-load a B+-Tree index from multiple data sources. 02191 * \ingroup SSMBULKLD 02192 * 02193 * @param[in] stid ID of the index to be loaded. 02194 * @param[in] nsrcs Number of files used for data sources. 02195 * @param[in] source Array of IDs of files used for data sources. 02196 * @param[out] stats Statistics concerning the load activity will be 02197 * written here. 02198 * @param[in] sort_duplicates If "true" the bulk-load will sort 02199 * duplicates by value. 02200 * @param[in] lexify_keys If "true" the keys are assumed not to 02201 * be in 02202 * lexicographic format, and the bulk-load will reformat the key before 02203 * storing it in the index, 02204 * otherwise they are assumed already to be in lexicographic format. 02205 * 02206 * \anchor LEXICOFORMAT 02207 * \b Lexicographic \b format 02208 * is the translation of numbers 02209 * (int, float, double, unsigned, etc) into byte strings 02210 * such that a lexicographic comparison of the byte strings 02211 * yields the same result as the numeric comparison of the 02212 * original data. 02213 * 02214 * \note The data must already have been sorted by 02215 * key in lexicographic format, but the keys themselves don't have 02216 * to be in lexicographic format; if the keys are not already in 02217 * lexicographic format, the \a lexify_keys must be given the value "true". 02218 * 02219 * In the case of duplicate keys, the bulk-load will handle the 02220 * sorting of the elements if \a sort_duplicates is "true"; this 02221 * sort will be done by a lexicographic comparison of the 02222 * byte strings that compose the elements. 
02223 */ 02224 static rc_t bulkld_index( 02225 const stid_t& stid, 02226 int nsrcs, 02227 const stid_t* source, 02228 sm_du_stats_t& stats, 02229 bool sort_duplicates = true, 02230 bool lexify_keys = true 02231 ); 02232 /**\brief Bulk-load a B+-Tree index from a single data source. 02233 * \ingroup SSMBULKLD 02234 * 02235 * @param[in] stid ID of the index to be loaded. 02236 * @param[in] source ID of the file used as the data source. 02237 * @param[out] stats Statistics concerning the load activity will be 02238 * written here. 02239 * @param[in] sort_duplicates If "true" the bulk-load will sort 02240 * duplicates by value. 02241 * @param[in] lexify_keys If "true" the keys are assumed not to 02242 * be in 02243 * lexicographic format, and the bulk-load will reformat the key before 02244 * storing it in the index, 02245 * otherwise they are assumed already to be in lexicographic format. 02246 */ 02247 static rc_t bulkld_index( 02248 const stid_t& stid, 02249 const stid_t& source, 02250 sm_du_stats_t& stats, 02251 bool sort_duplicates = true, 02252 bool lexify_keys = true 02253 ); 02254 /**\brief Bulk-load a B+-Tree index from a single data stream. 02255 * \ingroup SSMBULKLD 02256 * 02257 * @param[in] stid ID of the index to be loaded. 02258 * @param[in] sorted_stream Iterator that serves as the data source. 02259 * @param[out] stats Statistics concerning the load activity will be 02260 * written here. 02261 * 02262 * See sort_stream_i. 02263 */ 02264 static rc_t bulkld_index( 02265 const stid_t& stid, 02266 sort_stream_i& sorted_stream, 02267 sm_du_stats_t& stats); 02268 02269 /**\cond skip */ 02270 static rc_t print_index(stid_t stid); 02271 /**\endcond skip */ 02272 02273 /**\brief Create an entry in a B+-Tree index. 02274 * \ingroup SSMBTREE 02275 * 02276 * @param[in] stid ID of the index. 02277 * @param[in] key Key for the association to be created. 02278 * @param[in] el Element for the association to be created. 
02279 * 02280 * The combined sizes of the key and element vectors must 02281 * be less than or equal to \ref max_entry_size. 02282 */ 02283 static rc_t create_assoc( 02284 stid_t stid, 02285 const vec_t& key, 02286 const vec_t& el 02287 #ifdef SM_DORA 02288 , const bool bIgnoreLocks = false 02289 #endif 02290 ); 02291 /**\brief Remove an entry from a B+-Tree index. 02292 * \ingroup SSMBTREE 02293 * 02294 * @param[in] stid ID of the index. 02295 * @param[in] key Key of the entry to be removed. 02296 * @param[in] el Element (value) of the entry to be removed. 02297 */ 02298 static rc_t destroy_assoc( 02299 stid_t stid, 02300 const vec_t& key, 02301 const vec_t& el 02302 #ifdef SM_DORA 02303 , const bool bIgnoreLocks = false 02304 #endif 02305 ); 02306 /**\brief Destroy all entries associated with a key in a B+-Tree index. 02307 * \ingroup SSMBTREE 02308 * 02309 * @param[in] stid ID of the index. 02310 * @param[in] key Key of the entries to be removed. 02311 * @param[out] num_removed The number of entries removed is returned here. 02312 */ 02313 static rc_t destroy_all_assoc( 02314 stid_t stid, 02315 const vec_t& key, 02316 int& num_removed 02317 ); 02318 /**\brief Find an entry associated with a key in a B+-Tree index. 02319 * \ingroup SSMBTREE 02320 * 02321 * @param[in] stid ID of the index. 02322 * @param[in] key Key of the entry to be found. 02323 * @param[out] el Element associated with the given key will be copied into this buffer. 02324 * @param[in,out] elen Length of buffer into which the 02325 * result will be written. If too small, eRECWONTFIT will 02326 * be returned. 02327 * Length of result will be returned here. 02328 * @param[out] found True if an entry is found. 02329 * 02330 * If the index is not unique (allows duplicates), the first 02331 * element found with the given key will be returned. 02332 * 02333 * To locate all entries associated with a non-unique key, you must 02334 * use scan_index_i, q.v.. 
02335 */ 02336 static rc_t find_assoc( 02337 stid_t stid, 02338 const vec_t& key, 02339 void* el, 02340 smsize_t& elen, 02341 bool& found 02342 #ifdef SM_DORA 02343 , const bool bIgnoreLocks = false 02344 #endif 02345 ); 02346 02347 // 02348 // Functions for R*tree (multi-dimensional(MD), spatial) Indexes 02349 // 02350 02351 /**\addtogroup SSMRTREE 02352 * 02353 * An R-tree is a height-balanced structure designed for indexing 02354 * multi-dimensional spatial objects. 02355 * It stores the minimal bounding box (with 2 or higher dimension) of 02356 * a spatial object as the key in the leaf pages. 02357 * This implementation is a variant of an R-Tree called an R*-Tree, which 02358 * improves the search performance by using a heuristic for redistributing 02359 * entries and dynamically reorganizing the tree during insertion. 02360 * 02361 * An R*-Tree stores key,value pairs where the key is of type nbox_t 02362 * and the value is of type vec_t. 02363 * 02364 * The number of key-value pairs an index can hold is limited by the space 02365 * available on the volume containing the index. 02366 * The minimum size of an R*-tree index is 8 pages. 02367 * 02368 * 02369 * \note This implementation 02370 * uses coarse-grained (index-level) locking and 02371 * supports only 2 dimensions and integer coordinates. 02372 * For information about R*-trees, see the \ref BKSS. 02373 * 02374 * Example: 02375 * \code 02376 scan_rt_i scan(idx, nbox_t::t_overlap, universe, true); 02377 bool eof; 02378 nbox_t k; 02379 char* e; 02380 smsize_t elen; 02381 02382 for(int i=0; 02383 (!(rc = scanp->next(k,e,elen,eof)).is_error() && !eof); 02384 i++) ; 02385 cout << "Rtree " << idx << " contains " << i << " entries." << endl; 02386 \endcode 02387 * 02388 * 02389 * \section XXXX2 Bulk Loading 02390 * Bulk-loading of all index types is supported. See \ref SSMBULKLD. 02391 */ 02392 /*\example rtree_example.cpp*/ 02393 02394 02395 /**\brief Create an R*-Tree (multi-dimensional spatial) index. 
02396 * \ingroup SSMRTREE 02397 * @param[in] vid Volume on which to create the index. 02398 * @param[in] ntype Type of index. Legitimate values are: 02399 * - t_rtree : R*-Tree 02400 * @param[in] property Logging level of store. Legitimate values are: 02401 * - t_temporary 02402 * - t_regular 02403 * - t_load_file 02404 * - t_insert_file 02405 * See sm_store_property_t for details. 02406 * @param[in] dim Number of dimensions of the key. 02407 * They key type is an nbox_t. 02408 * See \ref nbox_t for details. 02409 * @param[out] stid New store ID will be returned here. 02410 */ 02411 static rc_t create_md_index( 02412 vid_t vid, 02413 ndx_t ntype, 02414 store_property_t property, 02415 stid_t& stid, 02416 int2_t dim = 2 02417 ); 02418 02419 /**\brief Destroy an R*-Tree index. 02420 * \ingroup SSMRTREE 02421 * 02422 * @param[in] iid ID of the index to be destroyed. 02423 */ 02424 static rc_t destroy_md_index(const stid_t& iid); 02425 02426 /**\brief Bulk-load a multi-dimensional index from multiple sources. 02427 * \ingroup SSMBULKLD 02428 * @param[in] stid ID of the index to be loaded. 02429 * @param[in] nsrcs Number of files used for data sources. 02430 * @param[in] source Array of IDs of files used for data sources. 02431 * @param[out] stats Statistics concerning the load activity will be 02432 * written here. 02433 * @param[in] hff Heuristic fill factor. Not used. 02434 * @param[in] hef Heuristic expansion factor. Not used. 02435 * @param[in] universe Universal bounding box of all spatial objects indexed. 02436 */ 02437 static rc_t bulkld_md_index( 02438 const stid_t& stid, 02439 int nsrcs, 02440 const stid_t* source, 02441 sm_du_stats_t& stats, 02442 int2_t hff=75, 02443 int2_t hef=120, 02444 nbox_t* universe=NULL); 02445 02446 /**\brief Bulk-load a multi-dimensional index from a single source. 02447 * \ingroup SSMBULKLD 02448 * @param[in] stid ID of the index to be loaded. 02449 * @param[in] source ID of file to be used for data source. 
02450 * @param[out] stats Statistics concerning the load activity will be 02451 * written here. 02452 * @param[in] hff Heuristic fill factor. Not used. 02453 * @param[in] hef Heuristic expansion factor. Not used. 02454 * @param[in] universe Universal bounding box of all spatial objects indexed. 02455 */ 02456 static rc_t bulkld_md_index( 02457 const stid_t& stid, 02458 const stid_t& source, 02459 sm_du_stats_t& stats, 02460 int2_t hff=75, 02461 int2_t hef=120, 02462 nbox_t* universe=NULL); 02463 02464 /**\brief Bulk-load a multi-dimensional index from a sorted stream source. 02465 * \ingroup SSMBULKLD 02466 * @param[in] stid ID of the index to be loaded. 02467 * @param[in] sorted_stream Input stream that is data source. 02468 * @param[out] stats Statistics concerning the load activity will be 02469 * written here. 02470 * @param[in] hff Heuristic fill factor. Not used. 02471 * @param[in] hef Heuristic expansion factor. Not used. 02472 * @param[in] universe Universal bounding box of all spatial objects indexed. 02473 */ 02474 static rc_t bulkld_md_index( 02475 const stid_t& stid, 02476 sort_stream_i& sorted_stream, 02477 sm_du_stats_t& stats, 02478 int2_t hff=75, 02479 int2_t hef=120, 02480 nbox_t* universe=NULL); 02481 02482 static rc_t print_md_index(stid_t stid); 02483 02484 /**\brief Look up an entry in a multi-dimensional index. 02485 * \ingroup SSMRTREE 02486 * 02487 * @param[in] stid ID of the index. 02488 * @param[in] key Key associated with the entry to look up. 02489 * @param[out] el Element associated with the given key will be copied into this buffer. 02490 * @param[in] elen Length of buffer into which the 02491 * result will be written. If too small, eRECWONTFIT will 02492 * be returned. 02493 * Length of result will be returned here. 02494 * @param[out] found True if an entry is found. 
02495 */ 02496 static rc_t find_md_assoc( 02497 stid_t stid, 02498 const nbox_t& key, 02499 void* el, 02500 smsize_t& elen, 02501 bool& found); 02502 02503 /**\brief Create an entry in a multi-dimensional index. 02504 * \ingroup SSMRTREE 02505 * 02506 * @param[in] stid ID of the index. 02507 * @param[in] key Key for the association to be created. 02508 * @param[in] el Element for the association to be created. 02509 */ 02510 static rc_t create_md_assoc( 02511 stid_t stid, 02512 const nbox_t& key, 02513 const vec_t& el); 02514 02515 /**\brief Destroy an entry in a multi-dimensional index. 02516 * \ingroup SSMRTREE 02517 * 02518 * @param[in] stid ID of the index. 02519 * @param[in] key Key of the entry to be removed. 02520 * @param[in] el Element (value) of the entry to be removed. 02521 */ 02522 static rc_t destroy_md_assoc( 02523 stid_t stid, 02524 const nbox_t& key, 02525 const vec_t& el); 02526 02527 /**\cond skip */ 02528 // for debugging 02529 static rc_t draw_rtree(const stid_t& stid, ostream &); 02530 /**\endcond skip */ 02531 02532 /**\brief Gather usage statistics about an R*-Tree index. 02533 * \ingroup SSMRTREE 02534 * @param[in] stid ID of the index. 02535 * @param[out] stat Usage statistics will be written here. 02536 * @param[in] size Number of uint2_t's in the array ovp. 02537 * @param[out] ovp Pre-allocated array of integers into which 02538 * the method will write the overlap percentages for each level of the 02539 * tree. 02540 * @param[in] audit If "true", the method 02541 * will check assertions about the 02542 * correctness of the rtree. 02543 * If the audit fails an internal fatal error is generated 02544 * to facilitate debugging. (It will generate a core file if your 02545 * shell permits such.) 
02546 * 02547 * \note for debugging 02548 */ 02549 static rc_t rtree_stats( 02550 const stid_t& stid, 02551 rtree_stats_t& stat, 02552 uint2_t size = 0, 02553 uint2_t* ovp = NULL, 02554 bool audit = false); 02555 02556 /**\addtogroup SSMFILE 02557 * You can create, destroy, and scan files of records. You may exert some 02558 * control over the order in which records appear in the file (a physical 02559 * scan), but, in general, the storage manager decides where to put records. 02560 * 02561 * Pages in a file are slotted pages: Each page contains an array of 02562 * slots. 02563 * Records take one of three forms: small, large, and very large. 02564 * - Small records fit in the slots on the file pages. 02565 * - Large records are too big to fit on a slotted page, so they are put 02566 * elsewhere, and the slots point to these records. Actually, what is 02567 * in a slot is a small array of page pointers to the data of the large record. 02568 * - A very large record is one whose slot in the file page contains 02569 * a single reference to a page that is an index of data pages. 02570 * 02571 * Because records may take these forms, the API for creating records 02572 * contains the opportunity for you to provide a hint about the ultimate 02573 * size of the record so that the storage manager can create the proper 02574 * structure for the record immediately, rather than creating a small 02575 * record that is soon to be converted to a large, then a very large record 02576 * by subsequent appends. 02577 * 02578 * All records contain a client-defined header. This is for the convenience 02579 * of server-writers. The header must fit on the slotted page, so it should 02580 * never be very large. 02581 * 02582 * The following methods manipulate files of records and the records found 02583 * there. 
02584 * 02585 * Modules below describe file traversal and 02586 * appending to files (\ref SSMSCANF), 02587 * and pinning individual records in the buffer pool for extended operations 02588 * (\ref SSMPIN). 02589 * 02590 * \section UNINIT Uninitialized Data 02591 * The functions create_rec, append_rec, and update_rec can be used to 02592 * write blocks of data that are all zeroes, with minimal logging. 02593 * This is useful for creating records of known size but with uninitialized data. 02594 * The type zvec_t, a special case of vec_t, is for this purpose. 02595 * Construct it with only a size, as follows: 02596 * \code 02597 * zvec_t zdata(100000); 02598 * \endcode 02599 * The underlying logging code recognizes that this is a vector of zeroes and 02600 * logs only a count, not the data themselves. 02601 * 02602 * \section Errors 02603 * If an error occurs in the middle of one of these methods that is updating persistent data, 02604 * the record or file \e could be in an inconsistent state. 02605 * The caller has the choice of aborting the transaction or rolling back to the nearest savepoint (see \ref SSMXCT). 02606 * 02607 * \sa SSMSCAN, SSMPIN, vec_t, zvec_t, IDs. 02608 */ 02609 02610 /**\brief Create a file of records. 02611 * \ingroup SSMFILE 02612 * \details 02613 * @param[in] vid Volume on which to create a file. 02614 * @param[out] fid Returns (store) ID of the new file here. 02615 * @param[in] property Give the file the this property. 02616 * @param[in] cluster_hint Not used. 02617 * 02618 * The cluster hint is included in the API for future use. 02619 * It has no effect. 02620 */ 02621 static rc_t create_file( 02622 vid_t vid, 02623 stid_t& fid, 02624 store_property_t property, 02625 shpid_t cluster_hint = 0 02626 ); 02627 02628 /**\brief Destroy a file of records. 02629 * \ingroup SSMFILE 02630 * \details 02631 * @param[in] fid ID of the file to destroy. 
02632 */ 02633 static rc_t destroy_file(const stid_t& fid); 02634 02635 /**\brief Create a new record. 02636 * \ingroup SSMFILE 02637 * \details 02638 * @param[in] fid ID of the file in which to create a record. 02639 * @param[in] hdr What to put in the record's header. 02640 * @param[in] len_hint Hint about how big the record will ultimately be. 02641 * This is used to determine the initial format of the record. If you plan 02642 * to append to the record and know that it will ultimately become a large 02643 * record, it is more efficient to give a size hint that is larger than 02644 * a page here. Otherwise, the record will be made small (as determined by 02645 * the size of the parameter \a data ), and subsequent appends will cause 02646 * the record to be converted to a large record. 02647 * @param[in] data What to put in the record's body. 02648 * @param[out] new_rid ID of the newly created record. 02649 */ 02650 static rc_t create_rec( 02651 const stid_t& fid, 02652 const vec_t& hdr, 02653 smsize_t len_hint, 02654 const vec_t& data, 02655 rid_t& new_rid 02656 #ifdef SM_DORA 02657 , const bool bIgnoreLocks = false 02658 #endif 02659 ); 02660 02661 /**\brief Destroy a record. 02662 * \ingroup SSMFILE 02663 * \details 02664 * @param[in] rid ID of the record to destroy. 02665 */ 02666 static rc_t destroy_rec(const rid_t& rid 02667 #ifdef SM_DORA 02668 , const bool bIgnoreLocks = false 02669 #endif 02670 ); 02671 02672 /**\brief Modify the body of an existing record. 02673 * \ingroup SSMFILE 02674 * \details 02675 * @param[in] rid ID of the record to modify. 02676 * @param[in] start First byte to change. 02677 * @param[in] data What to put in the record's body. 02678 * 02679 * This overwrites 02680 * the existing bytes, starting at the offset \a start through the 02681 * byte at \a start + \a data.size(). 02682 * This method \b cannot \b be \b used to change the size of a record. 02683 * Attempting this will result in an error. 
02684 */ 02685 static rc_t update_rec( 02686 const rid_t& rid, 02687 smsize_t start, 02688 const vec_t& data); 02689 02690 /**\brief Modify the header of an existing record. 02691 * \ingroup SSMFILE 02692 * \details 02693 * @param[in] rid ID of the record to modify. 02694 * @param[in] start First byte to change. 02695 * @param[in] hdr What to put in the record's header. 02696 * 02697 * This overwrites 02698 * the existing bytes, starting at the offset \a start through the 02699 * byte at \a start + \a data.size(). 02700 * This method \b cannot \b be \b used to change the size of a record 02701 * header. There are no methods for appending to or truncating a 02702 * record header. 02703 * 02704 * \sa pin_i::update_rec, \ref SSMPIN 02705 */ 02706 static rc_t update_rec_hdr( 02707 const rid_t& rid, 02708 smsize_t start, 02709 const vec_t& hdr); 02710 // see also pin_i::update_rec*() 02711 02712 /**\brief Append bytes to a record body. 02713 * \ingroup SSMFILE 02714 * \details 02715 * @param[in] rid ID of the record to modify. 02716 * @param[in] data What to append to the record. 02717 * 02718 * \note This appends \b to a record; it does \b not append a record to a file! 02719 * \sa pin_i::append_rec, \ref SSMPIN 02720 */ 02721 static rc_t append_rec( 02722 const rid_t& rid, 02723 const vec_t& data 02724 ); 02725 02726 /**\brief Chop bytes off the end of a record body. 02727 * \ingroup SSMFILE 02728 * \details 02729 * @param[in] rid ID of the record to modify. 02730 * @param[in] amount How many bytes to lop off. 02731 * 02732 * \sa pin_i::truncate_rec, \ref SSMPIN 02733 */ 02734 static rc_t truncate_rec( 02735 const rid_t& rid, 02736 smsize_t amount 02737 ); 02738 02739 /**\brief Chop bytes off the end of a record body. 02740 * \ingroup SSMFILE 02741 * \details 02742 * @param[in] rid ID of the record to modify. 02743 * @param[in] amount How many bytes to lop off. 
02744 * @param[out] should_forward Returns true if the record started out 02745 * large but is now small as a result of the truncation. 02746 * This enables a value-added server to take action in this event, 02747 * should it so desire. 02748 * 02749 * \sa pin_i::truncate_rec, \ref SSMPIN 02750 */ 02751 static rc_t truncate_rec( 02752 const rid_t& rid, 02753 smsize_t amount, 02754 bool& should_forward 02755 ); 02756 02757 #ifdef OLDSORT_COMPATIBILITY 02758 typedef ssm_sort::key_info_t key_info_t; 02759 02760 /* old sort physical version */ 02761 /**\brief Sort a file. Deprecated. 02762 * \details 02763 */ 02764 static rc_t sort_file( 02765 const stid_t& fid, 02766 vid_t vid, 02767 stid_t& sfid, 02768 store_property_t property, 02769 const key_info_t& key_info, 02770 int run_size, 02771 bool ascending = true, 02772 bool unique = false, 02773 bool destructive = false, 02774 bool use_new_sort = true); 02775 02776 /**\brief Sort a file. Deprecated. 02777 * \details 02778 */ 02779 static rc_t new_sort_file( 02780 const stid_t& fid, 02781 vid_t vid, 02782 stid_t& sfid, 02783 store_property_t property, 02784 const key_info_t& key_info, 02785 int run_size, 02786 bool ascending = true, 02787 bool unique = false, 02788 bool destructive = false 02789 ); 02790 #endif /* OLDSORT_COMPATIBILITY */ 02791 02792 typedef ssm_sort::sort_keys_t sort_keys_t; 02793 02794 /* new sort physical version : see notes below */ 02795 /**\brief Sort a file. 02796 * \ingroup SSMSORT 02797 * @param[in] fid File to sort. 02798 * @param[in] sorted_fid File to which to write the results. 02799 * @param[in] nvids Size of array \a vid. 02800 * @param[in] vid Array of IDs of scratch files created by the caller. 02801 * @param[in] kl See sort_keys_t. 02802 * @param[in] min_rec_sz Hint of minimum record size in input file. 02803 * @param[in] run_size Number of pages in buffer pool to use for a run. 02804 * @param[in] temp_space Number of pages to use for scratch space. 
02805 * (This limits the amount of memory used by the sort). 02806 * 02807 * \details 02808 * Before you call sort_file, you must create an output file \a sorted_fid 02809 * into which sort_file will write the results. 02810 * 02811 * The sort uses temporary files when the input file contains more records 02812 * than can fit in one run (determined by \a run_size). These temporary files 02813 * may be spread across multiple volumes, which is useful if the 02814 * volumes reside on different spindles. The arguments \a nvids 02815 * and \a vid are for indicating the volumes to use for these scratch 02816 * files. 02817 * 02818 * The caller can provide a clue in \a min_rec_size 02819 * about the minimum record size of the 02820 * input file, which can help the sort's efficiency. 02821 * 02822 * The \a run_size indicates how many buffer-pool pages to use 02823 * for each run. 02824 * Since at all times one page is fixed for output, while the rest are 02825 * for reading the input in runs, the real run size is \a run_size-1. 02826 * 02827 */ 02828 static rc_t sort_file( 02829 const stid_t& fid, // input file 02830 const stid_t& sorted_fid, // output file 02831 int nvids, // array size for vids 02832 const vid_t* vid, // array of vids for temp 02833 // files 02834 // created by caller-- 02835 // can be same as input file 02836 sort_keys_t& kl, // kl & 02837 smsize_t min_rec_sz, // for estimating space use 02838 int run_size, // # pages to use for a run 02839 int temp_space // # pages VM to use for scratch 02840 ); 02841 02842 /**\brief Return the short volume ID of a volume. 02843 * \ingroup SSMVOL 02844 * 02845 * @param[in] lvid Long (persistent) volume ID found on the volume's 02846 * header. 02847 * @param[out] vid Short volume ID of a mounted volume. 02848 */ 02849 static rc_t lvid_to_vid( 02850 const lvid_t& lvid, 02851 vid_t& vid); 02852 02853 /**\brief Return the long volume ID of a volume. 
02854 * \ingroup SSMVOL 02855 * 02856 * @param[in] vid Short volume ID of a mounted volume. 02857 * @param[out] lvid Long (persistent) volume ID found on the volume's 02858 * header. 02859 */ 02860 static rc_t vid_to_lvid( 02861 vid_t vid, 02862 lvid_t& lvid); 02863 02864 /***************************************************************** 02865 * Locking related functions 02866 * 02867 * NOTE: there are standard conversions from lpid_t, rid_t, and 02868 * stid_t to lockid_t, so wherever a lockid_t parameter is 02869 * specified a lpid_t, rid_t, or stid_t can be used. 02870 * 02871 *****************************************************************/ 02872 02873 #if SLI_HOOKS 02874 /* enable/disable SLI globally for all threads created after this 02875 point. Does *NOT* disable SLI for existing threads. 02876 */ 02877 static void set_sli_enabled(bool enabled); 02878 static void set_elr_enabled(bool enabled); 02879 02880 static rc_t set_log_features(char const* features); 02881 static char const* get_log_features(); 02882 #endif 02883 02884 /**\brief Acquire a lock. 02885 * \ingroup SSMLOCK 02886 * @param[in] n Lock id of the entity to lock. There are 02887 * conversions from record ids, volume ids, store ids, and page ids to 02888 * lockid_t. 02889 * @param[in] m Desired lock mode. Values: EX, SH. 02890 * @param[in] d Desired duration. Values: 02891 * - t_very_long : Held across transaction boundaries; 02892 * cannot be released by unlock() 02893 * - t_long : Released at commit; cannot be released by unlock() 02894 * - t_medium : May be released early by explicit unlock() 02895 * - t_short : May be released early by explicit unlock() 02896 * - t_instant : Not held: acquired and released immediately. Useful 02897 * to see if any other transaction holds an incompatible lock. 02898 * @param[in] timeout Milliseconds willing to block. See timeout_in_ms. 
02899 * 02900 * The lock manager is written with these durations in mind, but the 02901 * only durations used by the storage manager are t_instant and t_long. 02902 * Medium-duration locks are used internally in a one place. 02903 * 02904 * Durations other than long and instant are not well-tested. 02905 */ 02906 static rc_t lock( 02907 const lockid_t& n, 02908 lock_mode_t m, 02909 lock_duration_t d = t_long, 02910 timeout_in_ms timeout = WAIT_SPECIFIED_BY_XCT 02911 ); 02912 02913 /**\brief Release a lock. 02914 * \ingroup SSMLOCK 02915 * @param[in] n Lock id of the entity to lock. There are 02916 * conversions from record ids, volume ids, store ids, and page ids to 02917 * lockid_t. 02918 */ 02919 static rc_t unlock(const lockid_t& n); 02920 02921 /**\brief Disable lock escalation on the given entity. 02922 * \ingroup SSMLOCK 02923 * @param[in] n Lock id of the entity to lock. There are 02924 * conversions from record ids, volume ids, store ids, and page ids to 02925 * lockid_t. 02926 * @param[in] passOnToDescendants If true, apply this to the descendants 02927 * of \a n. 02928 */ 02929 static rc_t dont_escalate( 02930 const lockid_t& n, 02931 bool passOnToDescendants = true 02932 ); 02933 02934 /**\brief Find the storage-manager-wide escalation thresholds 02935 * \ingroup SSMLOCK 02936 * Default values (used for all transactions until they change 02937 * their per-transaction thresholds) are determined by the 02938 * storage-manager-wide options. 02939 * See \ref SSMOPT. 02940 */ 02941 static rc_t get_escalation_thresholds( 02942 w_base_t::int4_t& toPage, 02943 w_base_t::int4_t& toStore, 02944 w_base_t::int4_t& toVolume); 02945 02946 /**\brief Change the storage-manager-wide escalation thresholds 02947 * \ingroup SSMLOCK 02948 * Default values (used for all transactions until they change 02949 * their per-transaction thresholds) are determined by the 02950 * storage-manager-wide options. 02951 * See \ref SSMOPT. 
02952 */ 02953 static rc_t set_escalation_thresholds( 02954 w_base_t::int4_t toPage, 02955 w_base_t::int4_t toStore, 02956 w_base_t::int4_t toVolume); 02957 02958 /**\brief Find out if the attached transaction has an entity locked. 02959 * \ingroup SSMLOCK 02960 * @param[in] n Lock id of the entity to lock. There are 02961 * conversions from record ids, volume ids, store ids, and page ids to 02962 * lockid_t. 02963 * @param[out] m Mode of lock held. NL if none. 02964 * @param[in] implicit If "true" the query will returns a lock mode if 02965 * an implicit lock is held, otherwise the lock must be held explicitly. 02966 */ 02967 static rc_t query_lock( 02968 const lockid_t& n, 02969 lock_mode_t& m, 02970 bool implicit = false 02971 ); 02972 02973 /***************************************************************** 02974 * Lock Cache related functions 02975 * 02976 * Each transaction has a cache of recently acquired locks 02977 * The following functions control the use of the cache. 02978 * Note that the functions affect the transaction currently 02979 * associated with the thread. 02980 *****************************************************************/ 02981 // turn on(enable=true) or off/(enable=false) the lock cache 02982 // return previous state. 02983 /**\brief Control lock caching for attached transaction. 02984 * \ingroup SSMLOCK 02985 * 02986 * @param[in] enable Set to true if you want to turn on lock caching 02987 * for the attached transaction. The default is that it is turned on. 02988 * 02989 * Only long-duration locks are cached. 02990 * Lock caching can be turned off by default using the 02991 * sm_lock_caching option. Even with it turned off by default, it 02992 * can be turned on for a given transcation with this method. 
02993 * 02994 */ 02995 static rc_t set_lock_cache_enable(bool enable); 02996 02997 /**\brief True if lock cache is enabled for the attached transaction 02998 * \ingroup SSMLOCK 02999 * 03000 * @param[out] enabled Will be set to true if the attached transaction has 03001 * lock caching enabled, false otherwise. 03002 */ 03003 static rc_t lock_cache_enabled(bool& enabled); 03004 03005 private: 03006 03007 static int _instance_cnt; 03008 static option_group_t* _options; 03009 static option_t* _hugetlbfs_path; 03010 static option_t* _reformat_log; 03011 static option_t* _prefetch; 03012 static option_t* _bufpoolsize; 03013 static option_t* _locktablesize; 03014 static option_t* _logdir; 03015 static option_t* _logsize; 03016 static option_t* _logbufsize; 03017 static option_t* _error_log; 03018 static option_t* _error_loglevel; 03019 static option_t* _lockEscalateToPageThreshold; 03020 static option_t* _lockEscalateToStoreThreshold; 03021 static option_t* _lockEscalateToVolumeThreshold; 03022 static option_t* _cc_alg_option; 03023 static option_t* _log_warn_percent; 03024 static option_t* _num_page_writers; 03025 static option_t* _logging; 03026 static option_t* _lock_caching_default; 03027 03028 03029 static rc_t _set_option_logsize( 03030 option_t* opt, 03031 const char* value, 03032 ostream* err_stream); 03033 03034 static rc_t _set_option_lock_escalate_to_page( 03035 option_t* opt, 03036 const char* value, 03037 ostream* err_stream); 03038 03039 static rc_t _set_option_lock_escalate_to_store( 03040 option_t* opt, 03041 const char* value, 03042 ostream* err_stream); 03043 03044 static rc_t _set_option_lock_escalate_to_volume( 03045 option_t* opt, 03046 const char* value, 03047 ostream* err_stream); 03048 03049 static rc_t _set_store_property( 03050 stid_t stid, 03051 store_property_t property); 03052 03053 static rc_t _get_store_property( 03054 stid_t stid, 03055 store_property_t& property); 03056 03057 static rc_t _begin_xct( 03058 sm_stats_info_t* stats, // 
allocated by caller 03059 tid_t& tid, 03060 timeout_in_ms timeout); 03061 03062 static rc_t _commit_xct( 03063 sm_stats_info_t*& stats, 03064 bool lazy, 03065 lsn_t* plastlsn); 03066 03067 static rc_t _prepare_xct( 03068 sm_stats_info_t*& stats, 03069 vote_t& v); 03070 03071 static rc_t _set_coordinator(const server_handle_t &); 03072 03073 static rc_t _enter_2pc(const gtid_t &); 03074 static rc_t _force_vote_readonly(); 03075 static rc_t _recover_2pc(const gtid_t &,// in 03076 bool mayblock, 03077 tid_t & //out -- attached if found(?) 03078 ); 03079 static rc_t _chain_xct( 03080 sm_stats_info_t*& stats, 03081 bool lazy); 03082 03083 static rc_t _abort_xct( 03084 sm_stats_info_t*& stats); 03085 03086 static rc_t _save_work(sm_save_point_t& sp); 03087 03088 static rc_t _rollback_work(const sm_save_point_t& sp); 03089 static rc_t _mount_dev( 03090 const char* device, 03091 u_int& vol_cnt, 03092 vid_t local_vid); 03093 03094 static rc_t _dismount_dev( 03095 const char* device, 03096 bool dismount_if_locked = true 03097 ); 03098 static rc_t _create_vol( 03099 const char* device_name, 03100 const lvid_t& lvid, 03101 smksize_t quota_KB, 03102 bool skip_raw_init, 03103 const bool apply_fake_io_latency, 03104 const int fake_disk_latency); 03105 03106 static rc_t _create_index( 03107 vid_t vid, 03108 ndx_t ntype, 03109 store_property_t property, 03110 const char* key_desc, 03111 concurrency_t cc, 03112 stid_t& stid 03113 ); 03114 03115 static rc_t _destroy_index(const stid_t& iid); 03116 03117 static rc_t _get_store_info( 03118 const stid_t & stid, 03119 sm_store_info_t& info); 03120 03121 static rc_t _bulkld_index( 03122 const stid_t& stid, 03123 int nsrcs, 03124 const stid_t* source, 03125 sm_du_stats_t& stats, 03126 bool sort_duplicates = true, 03127 bool lexify_keys = true 03128 ); 03129 03130 static rc_t _bulkld_index( 03131 const stid_t& stid, 03132 sort_stream_i& sorted_stream, 03133 sm_du_stats_t& stats 03134 ); 03135 03136 static rc_t _print_index(const stid_t 
&iid); 03137 03138 static rc_t _create_assoc( 03139 const stid_t & stid, 03140 const vec_t& key, 03141 const vec_t& el 03142 #ifdef SM_DORA 03143 , const bool bIgnoreLocks = false 03144 #endif 03145 ); 03146 03147 static rc_t _destroy_assoc( 03148 const stid_t & stid, 03149 const vec_t& key, 03150 const vec_t& el 03151 #ifdef SM_DORA 03152 , const bool bIgnoreLocks = false 03153 #endif 03154 ); 03155 03156 static rc_t _destroy_all_assoc( 03157 const stid_t& stid, 03158 const vec_t& key, 03159 int& num_removed 03160 ); 03161 static rc_t _find_assoc( 03162 const stid_t& stid, 03163 const vec_t& key, 03164 void* el, 03165 smsize_t& elen, 03166 bool& found 03167 #ifdef SM_DORA 03168 , const bool bIgnoreLocks = false 03169 #endif 03170 ); 03171 03172 // below method overloaded for rtree 03173 static rc_t _create_md_index( 03174 vid_t vid, 03175 ndx_t ntype, 03176 store_property_t property, 03177 stid_t& stid, 03178 int2_t dim=2 03179 ); 03180 03181 static rc_t _destroy_md_index(const stid_t& iid); 03182 03183 static rc_t _destroy_md_assoc( 03184 stid_t stid, 03185 const nbox_t& key, 03186 const vec_t& el); 03187 03188 static rc_t _bulkld_md_index( 03189 const stid_t& stid, 03190 int nsrcs, 03191 const stid_t* source, 03192 sm_du_stats_t& stats, 03193 int2_t hff, // for rtree only 03194 int2_t hef, // for rtree only 03195 nbox_t* universe);// for rtree only 03196 03197 static rc_t _bulkld_md_index( 03198 const stid_t& stid, 03199 sort_stream_i& sorted_stream, 03200 sm_du_stats_t& stats, 03201 int2_t hff, // for rtree only 03202 int2_t hef, // for rtree only 03203 nbox_t* universe);// for rtree only 03204 03205 static rc_t _print_md_index(stid_t stid); 03206 03207 static rc_t _create_md_assoc( 03208 stid_t stid, 03209 const nbox_t& key, 03210 const vec_t& el); 03211 03212 static rc_t _find_md_assoc( 03213 stid_t stid, 03214 const nbox_t& key, 03215 void* el, 03216 smsize_t& elen, 03217 bool& found); 03218 03219 // 03220 // The following functions deal with files of 
records. 03221 // 03222 static rc_t _destroy_n_swap_file( 03223 const stid_t& old_fid, 03224 const stid_t& new_fid); 03225 03226 static rc_t _create_file( 03227 vid_t vid, 03228 stid_t& fid, 03229 store_property_t property, 03230 shpid_t cluster_hint = 0 03231 ); 03232 03233 static rc_t _destroy_file(const stid_t& fid); 03234 03235 static rc_t _create_rec( 03236 const stid_t& fid, 03237 const vec_t& hdr, 03238 smsize_t len_hint, 03239 const vec_t& data, 03240 rid_t& new_rid 03241 #ifdef SM_DORA 03242 , const bool bIgnoreLocks = false 03243 #endif 03244 ); 03245 03246 static rc_t _destroy_rec( 03247 const rid_t& rid 03248 #ifdef SM_DORA 03249 , const bool bIgnoreLocks = false 03250 #endif 03251 ); 03252 03253 static rc_t _update_rec( 03254 const rid_t& rid, 03255 smsize_t start, 03256 const vec_t& data 03257 #ifdef SM_DORA 03258 , const bool bIgnoreLocks = false 03259 #endif 03260 ); 03261 03262 static rc_t _update_rec_hdr( 03263 const rid_t& rid, 03264 smsize_t start, 03265 const vec_t& hdr 03266 #ifdef SM_DORA 03267 , const bool bIgnoreLocks = false 03268 #endif 03269 ); 03270 03271 static rc_t _append_rec( 03272 const rid_t& rid, 03273 const vec_t& data 03274 ); 03275 03276 static rc_t _truncate_rec( 03277 const rid_t& rid, 03278 smsize_t amount, 03279 bool& should_forward 03280 ); 03281 03282 static rc_t _draw_rtree(const stid_t& stid, ostream &); 03283 03284 static rc_t _rtree_stats( 03285 const stid_t& stid, 03286 rtree_stats_t& stat, 03287 uint2_t size, 03288 uint2_t* ovp, 03289 bool audit 03290 ); 03291 03292 #ifdef OLDSORT_COMPATIBILITY 03293 /* old sort internal, physical */ 03294 static rc_t _sort_file( 03295 const stid_t& fid, 03296 vid_t vid, 03297 stid_t& sfid, 03298 store_property_t property, 03299 const key_info_t& key_info, 03300 int run_size, 03301 bool ascending, 03302 bool unique, 03303 bool destructive 03304 ); 03305 #endif /* OLDSORT_COMPATIBILITY */ 03306 03307 /* new sort internal, physical */ 03308 static rc_t _sort_file( 03309 const stid_t& 
fid, // input file 03310 const stid_t& sorted_fid, // output file -- 03311 // created by caller-- 03312 // can be same as input file 03313 int nvids, // array size for vids 03314 const vid_t* vid, // array of vids for temp 03315 sort_keys_t& kl, // key location info & 03316 smsize_t min_rec_sz, // for estimating space use 03317 int run_size, // # pages to use for a run 03318 int temp_space //# pages VM to use for scratch 03319 ); 03320 03321 03322 #ifdef OLDSORT_COMPATIBILITY 03323 /* internal compatibility old sort-> new sort */ 03324 static rc_t _new_sort_file( 03325 const stid_t& in_fid, 03326 const stid_t& out_fid, 03327 const key_info_t& ki, 03328 int run_size, 03329 bool ascending, 03330 bool unique, 03331 bool keep_orig //!destructive 03332 ); 03333 #endif /* OLDSORT_COMPATIBILITY */ 03334 03335 static store_flag_t _make_store_flag(store_property_t property); 03336 // reverse function: 03337 // static store_property_t _make_store_property(w_base_t::uint4_t flag); 03338 // is in dir_vol_m 03339 03340 // this is for df statistics DU DF 03341 static rc_t _get_du_statistics( 03342 vid_t vid, 03343 sm_du_stats_t& du, 03344 bool audit); 03345 03346 static rc_t _get_du_statistics( 03347 const stid_t & stid, 03348 sm_du_stats_t& du, 03349 bool audit); 03350 03351 static rc_t _get_volume_meta_stats( 03352 vid_t vid, 03353 SmVolumeMetaStats& volume_stats, 03354 concurrency_t cc); 03355 03356 static rc_t _get_file_meta_stats( 03357 vid_t vid, 03358 w_base_t::uint4_t num_files, 03359 SmFileMetaStats* file_stats, 03360 bool batch_calculate, 03361 concurrency_t cc); 03362 }; 03363 03364 /**\brief Information about a store that can be queried by the client. 03365 * \details 03366 * This information is stored in a store directory on the volume. 03367 * It can be queried with ss_m::get_store_info. 
03368 */ 03369 class sm_store_info_t { 03370 public: 03371 NORET sm_store_info_t(int len) : 03372 store(0), stype(ss_m::t_bad_store_t), 03373 ntype(ss_m::t_bad_ndx_t), cc(ss_m::t_cc_bad), 03374 eff(0), large_store(0), root(0), 03375 nkc(0), keydescrlen(len) 03376 { keydescr = new char[len]; } 03377 03378 NORET ~sm_store_info_t() { if (keydescr) delete[] keydescr; } 03379 03380 /// store number 03381 snum_t store; 03382 /// t_index, t_file, ... See ss_m::store_t. 03383 u_char stype; 03384 /// t_btree, t_rtree,... See ss_m::ndx_t 03385 u_char ntype; 03386 /// t_cc_kvl, t_cc_record,... See ss_m::concurrency_t 03387 u_char cc; 03388 03389 /// Unused: 03390 u_char eff; 03391 03392 /// Store number for associated large-page store, if there is one. 03393 snum_t large_store; 03394 /// Root page if this is an index. 03395 shpid_t root; 03396 /// Number of key components if this is an index. 03397 w_base_t::uint4_t nkc; 03398 /// Size of key description (if this is an index) 03399 int keydescrlen; 03400 /**\brief Variable length string. 03401 * 03402 * He who creates a sm_store_info_t for use with get_store_info() 03403 * is responsible for allocating enough space for 03404 * key descriptors if he expects to find them. 03405 * See \ref key_description. 
03406 */ 03407 char *keydescr; 03408 }; 03409 03410 03411 ostream& operator<<(ostream& o, const vid_t& v); 03412 istream& operator>>(istream& i, vid_t& v); 03413 ostream& operator<<(ostream& o, const extid_t& x); 03414 istream& operator>>(istream& o, extid_t &x); 03415 ostream& operator<<(ostream& o, const stid_t& stid); 03416 istream& operator>>(istream& i, stid_t& stid); 03417 ostream& operator<<(ostream& o, const lpid_t& pid); 03418 istream& operator>>(istream& i, lpid_t& pid); 03419 ostream& operator<<(ostream& o, const shrid_t& r); 03420 istream& operator>>(istream& i, shrid_t& r); 03421 ostream& operator<<(ostream& o, const rid_t& rid); 03422 istream& operator>>(istream& i, rid_t& rid); 03423 ostream& operator<<(ostream& o, const sm_stats_info_t& s); 03424 template<class ostream> 03425 ostream& operator<<(ostream& o, const sm_config_info_t& s) 03426 { 03427 o << " page_size " << s.page_size 03428 << " max_small_rec " << s.max_small_rec 03429 << " lg_rec_page_space " << s.lg_rec_page_space 03430 << " buffer_pool_size " << s.buffer_pool_size 03431 << " max_btree_entry_size " << s.max_btree_entry_size 03432 << " exts_on_page " << s.exts_on_page 03433 << " pages_per_ext " << s.pages_per_ext 03434 << " multi_threaded_xct " << s.multi_threaded_xct 03435 << " logging " << s.logging 03436 ; 03437 return o; 03438 } 03439 03440 03441 #ifndef VEC_T_H 03442 #include <vec_t.h> 03443 #endif 03444 03445 #ifndef SM_ESCALATION_H 03446 #include <sm_escalation.h> 03447 #endif 03448 03449 /*<std-footer incl-file-exclusion='SM_H'> -- do not edit anything below this line -- */ 03450 03451 #endif /*</std-footer>*/