BadgerDB
|
00001 00008 #pragma once 00009 00010 #include <iostream> 00011 #include <string> 00012 #include "string.h" 00013 #include <sstream> 00014 #include <list> 00015 #include <algorithm> 00016 #include <deque> 00017 #include <stack> 00018 00019 #include "types.h" 00020 #include "page.h" 00021 #include "file.h" 00022 #include "buffer.h" 00023 00024 namespace badgerdb 00025 { 00026 00027 #define MAXKEYSIZE 12 //why 12?????????? 00028 00029 //constants used by the BTreeIndex class that are calculated 00030 //based on the type of data being used to create the index 00031 const int STRINGSIZE = 10; 00032 00033 // sibling ptr key rid 00034 const int INTARRAYLEAFSIZE = ( Page::SIZE - sizeof( PageId ) ) / ( sizeof( int ) + sizeof( RecordId ) ); 00035 const int DOUBLEARRAYLEAFSIZE = ( Page::SIZE - sizeof( PageId ) ) / ( sizeof( double ) + sizeof( RecordId ) ); 00036 const int STRINGARRAYLEAFSIZE = ( Page::SIZE - sizeof( PageId ) ) / ( 10 * sizeof(char) + sizeof( RecordId ) ); 00037 // level extra page pointer key page pointer 00038 const int INTARRAYNONLEAFSIZE = ( Page::SIZE - sizeof( int ) - sizeof( PageId ) ) / ( sizeof( int ) + sizeof( PageId ) ); 00039 const int DOUBLEARRAYNONLEAFSIZE = (( Page::SIZE - sizeof( int ) - sizeof( PageId ) ) / ( sizeof( double ) + sizeof( PageId ) ))-1; 00040 const int STRINGARRAYNONLEAFSIZE = ( Page::SIZE - sizeof( int ) - sizeof( PageId ) ) / ( 10 * sizeof(char) + sizeof( PageId ) ); 00041 00042 /* 00043 Structure to store a key-rid pair. 00044 It is used to pass the pair to functions that 00045 add to or make changes to the leaf node pages of the tree. 00046 Is templated for the key member. 00047 */ 00048 template <class T> 00049 class RIDKeyPair{ 00050 public: 00051 RecordId rid; 00052 T key; 00053 void set( RecordId r, T k) 00054 { 00055 rid = r; 00056 key = k; 00057 } 00058 }; 00059 00060 /* 00061 Structure to store a key page pair which is used to pass the key and page to functions that make 00062 any modifications to the non leaf pages of the tree. 00063 */ 00064 template <class T> 00065 class PageKeyPair{ 00066 public: 00067 PageId pageNo; 00068 T key; 00069 void set( int p, T k) 00070 { 00071 pageNo = p; 00072 key = k; 00073 } 00074 }; 00075 00076 /* 00077 Overloaded operator to compare the key values of two rid-key pairs 00078 and if they are the same compares to see if the first pair has 00079 a smaller rid.pageNo value. 00080 */ 00081 template <class T> 00082 bool operator<( const RIDKeyPair<T>& r1, const RIDKeyPair<T>& r2 ) 00083 { 00084 if( r1.key != r2.key ) 00085 return r1.key < r2.key; 00086 else 00087 return r1.rid.page_number < r2.rid.page_number; 00088 } 00089 00090 /* 00091 The meta page is always page 1 of the btree index file and is cast 00092 to the following structure to store or retrieve information from it. 00093 Contains the relation name for which the index is created, the byte offset 00094 of the key value on which the index is made, the type of the key and the page no 00095 of the root page. Root page starts as page 2 but since a split can occur 00096 at the root the root page may get moved up and get a new page no. 00097 */ 00098 struct IndexMetaInfo{ 00099 char relationName[20]; 00100 int attrByteOffset; 00101 Datatype attrType; 00102 PageId rootPageNo; 00103 }; 00104 00105 /* 00106 Each node is a page, so once we read the page in we just cast 00107 the pointer to the page to this struct and use it to access the parts 00108 These structures basically are the format in which the information 00109 is stored in the pages for the index file depending on what kind of 00110 node they are. 00111 The level memeber of each non leaf structure seen below is set to 1 if the 00112 nodes at this level are just above the leaf nodes. Otherwise set to 0. 00113 */ 00114 00115 /* 00116 Structure for all non leaf nodes that have the key value as int. 00117 Used to place information into a leaf node and then retrieve it 00118 as well. 00119 */ 00120 struct NonLeafNodeInt{ 00121 int level; 00122 int keyArray[ INTARRAYNONLEAFSIZE ]; 00123 PageId pageNoArray[ INTARRAYNONLEAFSIZE + 1 ]; 00124 }; 00125 00126 /* 00127 Structure for all non leaf nodes that have the key value as double. 00128 Used to place information into a leaf node and then retrieve it 00129 as well. 00130 */ 00131 struct NonLeafNodeDouble{ 00132 int level; 00133 double keyArray[ DOUBLEARRAYNONLEAFSIZE ]; 00134 PageId pageNoArray[ DOUBLEARRAYNONLEAFSIZE + 1 ]; 00135 }; 00136 00137 /* 00138 Structure for all non leaf nodes that have the key value as string. 00139 Used to place information into a leaf node and then retrieve it 00140 as well. 00141 */ 00142 struct NonLeafNodeString{ 00143 00144 int level; 00145 char keyArray[ STRINGARRAYNONLEAFSIZE ][ STRINGSIZE ]; 00146 PageId pageNoArray[ STRINGARRAYNONLEAFSIZE + 1 ]; 00147 }; 00148 00149 /* 00150 Structure for all leaf nodes that have the key value as int. 00151 Used to place information into a leaf node and then retrieve it 00152 as well. 00153 */ 00154 struct LeafNodeInt{ 00155 int keyArray[ INTARRAYLEAFSIZE ]; 00156 RecordId ridArray[ INTARRAYLEAFSIZE ]; 00157 PageId rightSibPageNo; 00158 }; 00159 00160 /* 00161 Structure for all leaf nodes that have the key value as double. 00162 Used to place information into a leaf node and then retrieve it 00163 as well. 00164 */ 00165 struct LeafNodeDouble{ 00166 double keyArray[ DOUBLEARRAYLEAFSIZE ]; 00167 RecordId ridArray[ DOUBLEARRAYLEAFSIZE ]; 00168 PageId rightSibPageNo; 00169 }; 00170 00171 /* 00172 Structure for all leaf nodes that have the key value as string. 00173 Used to place information into a leaf node and then retrieve it 00174 as well. 00175 */ 00176 struct LeafNodeString{ 00177 char keyArray[ STRINGARRAYLEAFSIZE ][ STRINGSIZE ]; 00178 RecordId ridArray[ STRINGARRAYLEAFSIZE ]; 00179 PageId rightSibPageNo; 00180 }; 00181 00182 // ----------------------------------------------------------------------------- 00183 // BTreeIndex 00184 // A class which implements a B+ Tree index on a single attribute of one 00185 // relation. This index supports only one scan at a time. 00186 // ----------------------------------------------------------------------------- 00187 class BTreeIndex { 00188 00189 private: 00190 00191 File *file; // file object for this index 00192 BufMgr *bufMgr; //buffer manager instance 00193 PageId headerPageNum; // number of header page in file 00194 PageId rootPageNum; // page number of root page 00195 Datatype attributeType; 00196 int leafOccupancy; // maximum size of leaf node 00197 int nodeOccupancy; // maximum size of non-leaf node 00198 00199 // Members specific to scanning 00200 00201 bool scanExecuting; // is an index scan currently executing? 00202 int nextEntry; // offset of the next entry to be scanned 00203 PageId currentPageNum; // page number of pinned page 00204 Page* currentPageData; 00205 int lowValInt; //variable that define the range based on which 00206 double lowValDouble; //type of index is being used for the scan 00207 std::string lowValString; 00208 int highValInt; 00209 double highValDouble; 00210 std::string highValString; 00211 00212 Operator lowOp; // GT (>) or GTE (>=) 00213 char highVal[MAXKEYSIZE]; // comparison value of filter 00214 Operator highOp; // LT (<) or LTE (<=) 00215 00216 //other meta info read in from page 1 of the file 00217 int attrByteOffset; 00218 00219 //constructor helper functions// 00220 /* 00221 This function is called by the actual btreeindex constructor. It reads the 00222 data from the actual heap file using HeapFileScan, stores the key rid 00223 pairs into a deque and then sorts it. After sorting the data it calls 00224 the fillLeafNodes to create all the required leaf nodes of the index and 00225 then calls the fillNonLeaf nodes till the it reached the point at which 00226 it has created the root 00227 It also creates the meta page and stores all the required information for the 00228 index being created. 00229 Uses BULK LOADING ALGORITHM 00230 Parameters- 00231 relationName : name of relation for which tree is being created 00232 outIndexName : name of index file that is being created 00233 attrByteOffset: offset of record that is the key for the index 00234 attrType: attribute type 00235 00236 status: to report any errors is set and can be used by calling function 00237 */ 00238 template <class T> 00239 void BTreeTemplateConstructor(const std::string & relationName, 00240 std::string & outIndexName, 00241 const int attrByteOffset, 00242 const Datatype attrType); 00243 00244 /* 00245 This function is called by the function above, it basically goes through the 00246 dataList deque which contains all the data entries and starts 00247 placing them in leafNodes. It allocates space for nodes and fills them up with the 00248 data entries. While doing so it fills the the thisLevelNodes deque placing 00249 the first data entry that was into each leaf node being created. 00250 The nodes are filled up based on the number of entries we have calculated 00251 each node as being able to contain. We only fill the nodes upto half the 00252 capacity calculated. 00253 Parameters- 00254 thisLevelNodes: is set by this function to tell the calling function 00255 which values need to placed in the levels above 00256 for the balanced tree to be created 00257 datalist: all the key values and rids for the records to be added to the leaf level 00258 Return Value: returns last page number assigned 00259 */ 00260 template <class T> 00261 int fillLeafNodes( 00262 std::deque< PageKeyPair< T > > &thisLevelNodes, 00263 std::deque< RIDKeyPair< T > > &dataList); 00264 00265 /* 00266 Once the leaf nodes are created, this function is called in a loop. It takes each 00267 entry in the thisLevelNodes, and places it into a nonLeaf node. Once the nonLeafnode 00268 is half full, it places the next entry into the lastLevelNodes deque, creates another 00269 nonLead node and starts filling it up until its half full and keeps going on like this. 00270 After it has been through the whole thisLevelNodes deque it set the thisLevelNodes deque 00271 equal to the lastLevelNodes deque so that the next time fillNonLeafNodes is called in 00272 the while loop of the bulk loading algorithm it uses that as we are going to be creating 00273 the level right above the one just created. 00274 Parameters- 00275 thisLevelNodes- indicates all the entries that were not placed 00276 in this level but will be placed into levels above 00277 lastLevelNodes- indicates all entries that go into this level or levels 00278 above 00279 level- indicates the level being populated 00280 status - set by the function and can be read by the calling function 00281 00282 */ 00283 template <class T> 00284 int fillNonLeafNodes( 00285 std::deque< PageKeyPair< T > > &thisLevelNodes, 00286 std::deque< PageKeyPair< T > > &lastLevelNodes, 00287 const int level); 00288 00289 /* 00290 This functions helps copy the key and rid from a record into a RIDKeyPair object 00291 based on the type of the data being used for the current index. 00292 Parameters- 00293 rec: record whose key pair is to be extracted 00294 rid: the rid for this record 00295 attrByteOffset: the offset of the key value in the record 00296 Return Value: returns the ridKey pair 00297 */ 00298 template <class T> 00299 RIDKeyPair<T> copyRecordToRidKeyPair( std::string rec, RecordId rid, const int attrByteOffset ); 00300 00301 //Scan Helper Functions// 00302 /* 00303 Starts a scan for the user. If a previous scan is still running, ends it and starts the 00304 new scan. It first checks to make sure that the op values used are allowed and correct. It 00305 then goes down the index tree looking for the lower bound specified and finds the leaf node 00306 page it should be in. Then the functions parses the node till entries are smaller than the lower 00307 bound specified. If this value is not found in the page, we know the lowest value for this scan is the first 00308 value of the next page. Next, the range parameters are checked based on which the we set the nextEntry value. 00309 The nextentry values is moved up using the scanNext function. 00310 Return Value: returns the status 00311 */ 00312 template <class T> 00313 const void templateStartScan(); 00314 00315 /* 00316 Basically checks the next entry in the leaf Node we are at and if we are at the end of the node 00317 checks the right sibling pointer and sets the next entry to the first entry in that page. It also 00318 checks to make sure we are still in the range specified by scanStart. 00319 */ 00320 template <class T> 00321 const void templateScanNext(RecordId& outRid); // returned record id 00322 00323 //Insert Helper Functions// 00324 /* 00325 called by the insertEntry function based on the type of tree being inserted into setting 00326 the appropriate size parameters and is always sent the root page number the first time. 00327 It starts at the root and goes down the tree along the path based on the value being inserted. 00328 Once it gets to the right leaf node page, if it can insert it there it does so, and if it is full 00329 it splits it and places the keyrid pair into either the old page that was halved or the new page 00330 depending on where it belongs and then this split is propogated back up each level. When it 00331 returns from the insertInto call that inserted the keyrid into the leaf it takes the first entry 00332 of the new page that was created because of the split and places it into the non leaf node. And again 00333 does a split if needed. This goes all the way upto the root. If the root is split, we the meta data 00334 is changed and this function handles that by writing the new information to the metapage. 00335 Parameters- 00336 pageNo: next page number of the path being followed along the tree for insert 00337 key: key value being inserted 00338 rid: rid value for the record whose key is being inserted 00339 treePath: stack to indicated what the last level that was inserted is 00340 Return Value- returns the status after insert 00341 */ 00342 template <class T> 00343 void insertInto(PageId pageNo, T key, PageKeyPair<T> &newPageKeyPair, 00344 RecordId &rid, std::stack<int> &treePath); 00345 00346 /* 00347 Finds the next page in the path down the index based on the key value we are trying to look 00348 for in the leaf nodes. 00349 Parameters- 00350 currentNode: pointer to the page being looked at to determing the next page in the path 00351 Return Value- pageNo for the path to be followed down the tree 00352 */ 00353 template <class T> 00354 PageId findPageNoInNonLeaf( Page *currentNode, T key ); 00355 00356 /* 00357 Called to place a key page pair into a non leaf node if its not full. First checks to see 00358 where it should go based on the sort order, then shifts everything over one and places 00359 the key page pair into the appropriate position. 00360 Parameters- 00361 currentNode: page into which the keypage pair is being inserted 00362 keyPagePair: key and page that need to be inserted 00363 lastIndexUsed:index of last valid entry in the page 00364 */ 00365 template <class T> 00366 void insertPageKeyPair( Page* currentNode, PageKeyPair<T> keyPagePair, int lastIndexUsed ); 00367 00368 /* 00369 Same as above except for the fact that it is called for leaf pages instead and places 00370 the key rid pair into the node 00371 Parameters- 00372 currentNode:node into which the key rid need to be inserted 00373 RIDKeyPair: pair of key and rid that need to be inserted into the node 00374 lastIndexUsed: last index that is valid in the node 00375 */ 00376 template <class T> 00377 void insertKeyRidPair( Page* currentNode, RIDKeyPair<T> keyRidPair, int lastIndexUsed ); 00378 00379 /* 00380 If a non leaf node is full, this function is called. It keeps the first half in the original 00381 node and takes the rest of the values and places it into the newly created node while zeroing 00382 out the entries from the original node and then also makes sure it zeros out the entries in the new 00383 nodes that are not used. 00384 Parameters- 00385 currentNode:node that needs to be split 00386 newNode:page ptr to the new node into which half the current node values will be placed 00387 */ 00388 template <class T> 00389 void splitNonLeafNode(Page* currentNode, Page* newNode); 00390 00391 /* 00392 Same as function above for the leaf nodes. 00393 Parameters- 00394 currentNode: node that needs to split 00395 newLeafNode: node ptr to place the second half of currentNode into 00396 newNodeIndexNotUsed:first index that is invalid in the node 00397 */ 00398 template <class T> 00399 void splitLeafNode(Page* currentNode, Page* newLeafNode, int &newNodeIndexNotUsed); 00400 00401 /* 00402 Indicates whether the non leaf node is full and if its not tells the calling function what the index is 00403 of the last entry that has valid data. We check the page no and if it is 0 that means the data is 00404 valid upto the page entry before that. 00405 Parameters- 00406 curentNode: node being checked for free space 00407 lastIndexUsed: index of last valid entry in the node 00408 */ 00409 void isNonLeafFull( Page* currentNode, int &lastIndexUsed); 00410 00411 /* 00412 Same as function above for leaf nodes. 00413 Parameters- 00414 currentNode:node being checked for free space 00415 lastIndexUsed: index of last valid entry of the node 00416 */ 00417 void isLeafFull( Page* currentNode, int &lastIndexUsed); 00418 00419 /* 00420 Function to set the right high and low val members of the class object based on the type 00421 of index being scanned 00422 Parameters- 00423 val- value to set the high/low val variable to 00424 high- if true set the high val, else set the low val 00425 */ 00426 template <class T> 00427 void setHighLowVal(const void* val, bool high); 00428 00429 /* 00430 Function to get the righr high and low val members of the class object based on the type of the 00431 index being scanned. 00432 Parameters- 00433 high:if true get the high val, else get the low val 00434 */ 00435 template <class T> 00436 T getHighLowVal(bool high); 00437 00438 /* 00439 Function to copy the key value to the appropriate index of a non leaf or leaf node. 00440 It takes the node and casts it to the right type of structure so that we can get 00441 access to the right part of the node that we want to write to. 00442 Parameters- 00443 key:key to copy into the index of the node 00444 nodePagePtr:node into which key needs to be copied 00445 index: index into which the key needs to be copied 00446 leaf:whether the node being inserted into is a leaf node or not 00447 */ 00448 template <class T> 00449 void copyKey(T key, Page *nodePagePtr , int index, bool leaf); 00450 00451 /* 00452 Gets the key values stored at the index specified for a leaf or non leaf node. 00453 Return Value- returns the key value extracted 00454 */ 00455 template <class T> 00456 T extractKey(T &value, Page *nodePagePtr , int index, bool leaf); 00457 00458 /* 00459 Gets the page no from the appropriate index of a non leaf page. 00460 Parameters- 00461 pageNo-value of the pageNo that is at the index specified in the node 00462 nodePagePtr:node from which the pageNo needs to be extracted 00463 index:index of entry being extracted 00464 Return Value- returns the pageNo extracted too 00465 */ 00466 PageId extractPageNo(PageId &pageNo, Page *nodePagePtr, int index); 00467 00468 /* 00469 Copies the page no into the appropriate index of a non leaf page. 00470 Parameters- 00471 pageNo:pageno to be placed in the node 00472 nodePagePtr:node into which the pageno needs to be inserted 00473 index: index at which the pageno needs to be inserted 00474 */ 00475 void copyPageNo(PageId pageNo, Page *nodePagePtr, int index); 00476 00477 /* 00478 Extracts the level from the non leaf page. 00479 Parameters- 00480 pageNo:set to the level of the node 00481 nodePagePtr:node whose level is needed 00482 Return Value-returns the level of the node 00483 */ 00484 int extractLevel(int &level, Page *nodePagePtr); 00485 00486 /* 00487 Sets the level value for the non leaf page. 00488 */ 00489 void copyLevel( int level, Page *nodePagePtr ); 00490 00491 /* 00492 Sets the rid of the appropriate index of the leaf node. 00493 */ 00494 void copyRid(RecordId rid, Page *nodePagePtr, int index); 00495 00496 /* 00497 Extracts the rid from the appropriateindex of a leaf page. 00498 */ 00499 RecordId extractRid(RecordId &rid, Page *nodePagePtr, int index); 00500 00501 /* 00502 Set the right sib ptr of a leaf node. 00503 */ 00504 void copySibPtr(PageId pageNo, Page *nodePagePtr); 00505 00506 /* 00507 Get the right sib ptr of a leaf node. 00508 */ 00509 PageId extractSibPtr(PageId &pageNo, Page *nodePagePtr); 00510 00511 /* 00512 Zero out everything in the node. 00513 */ 00514 void zeroOutNonLeaf(Page *currentNode); 00515 00516 /* 00517 Zero out everything in the node. 00518 */ 00519 void zeroOutLeaf(Page *currentNode); 00520 00521 /* 00522 Debug help function to print the state of the tree at any given point. 00523 */ 00524 template <class T> 00525 void printTree(PageId pageNum , bool isLeaf); 00526 00527 public: 00528 00529 // --------------------------------------------------------------------------- 00530 // Public interface methods 00531 // --------------------------------------------------------------------------- 00532 00533 // Check to see if the specified index file exists. If so, open the file. 00534 // If not, create it and insert an entry for every tuple in the base relation 00535 // Keep the header page pinned in memory. 00536 00537 BTreeIndex(const std::string & relationName, // name of relation file to index 00538 std::string & outIndexName, // name of index file 00539 BufMgr *bufMgrIn, //buffer manager instance 00540 const int attrByteOffset, // offset to attrtibute in tuple 00541 const Datatype attrType); 00542 00543 00544 // Unpin the header page and close the file. (do not delete the index file) 00545 00546 ~BTreeIndex(); 00547 00548 // Insert a new entry using the pair <value,rid>. Check for unique values. 00549 00550 const void insertEntry(const void* value, // value to insert 00551 const RecordId rid); // corresponding tuple rid 00552 00553 // Begin a filtered scan of the index. For example, if the method is called 00554 // using ("a",GT,"d",LTE) then we should seek all entries with a value 00555 // greater than "a" and less than or equal to "d". 00556 00557 const void startScan(const void* lowVal, // low value of range 00558 const Operator lowOp, // operation (GT, GTE) 00559 const void* highVal, // high value of range 00560 const Operator highOp); // operation (LT, LTE) 00561 00562 // Fetch the record id of the next index entry that matches the scan. 00563 00564 const void scanNext(RecordId& outRid); // returned record id 00565 00566 // Terminate the current scan 00567 00568 const void endScan(); 00569 00570 /* 00571 Public function to allow for a public interface to print the index B+-Tree 00572 Useful for testindex 00573 */ 00574 void printTreeExternal(); 00575 00576 }; 00577 00578 }