BadgerDB
/afs/cs.wisc.edu/u/n/w/nwilliam/private/workspace/Quut/src/btree.h
00001 
00008 #pragma once
00009 
00010 #include <iostream>
00011 #include <string>
00012 #include "string.h"
00013 #include <sstream>
00014 #include <list>
00015 #include <algorithm>
00016 #include <deque>
00017 #include <stack>
00018 
00019 #include "types.h"
00020 #include "page.h"
00021 #include "file.h"
00022 #include "buffer.h"
00023 
00024 namespace badgerdb
00025 {
00026 
00027 #define MAXKEYSIZE 12 //why 12??????????
00028 
00029 //constants used by the BTreeIndex class that are calculated
00030 //based on the type of data being used to create the index
00031 const  int STRINGSIZE = 10;
00032 
00033 //                                                sibling ptr           key               rid
00034 const  int INTARRAYLEAFSIZE = ( Page::SIZE - sizeof( PageId ) ) / ( sizeof( int ) + sizeof( RecordId ) );
00035 const  int DOUBLEARRAYLEAFSIZE = ( Page::SIZE - sizeof( PageId ) ) / ( sizeof( double ) + sizeof( RecordId ) );
00036 const  int STRINGARRAYLEAFSIZE = ( Page::SIZE - sizeof( PageId ) ) / ( 10 * sizeof(char) + sizeof( RecordId ) );
00037 //                                                    level        extra page pointer        key       page pointer
00038 const  int INTARRAYNONLEAFSIZE = ( Page::SIZE - sizeof( int ) - sizeof( PageId ) ) / ( sizeof( int ) + sizeof( PageId ) );
00039 const  int DOUBLEARRAYNONLEAFSIZE = (( Page::SIZE - sizeof( int ) - sizeof( PageId ) ) / ( sizeof( double ) + sizeof( PageId ) ))-1;
00040 const  int STRINGARRAYNONLEAFSIZE = ( Page::SIZE - sizeof( int ) - sizeof( PageId ) ) / ( 10 * sizeof(char) + sizeof( PageId ) );
00041 
00042 /*
00043 Structure to store a key-rid pair.
00044 It is used to pass the pair to functions that 
00045 add to or make changes to the leaf node pages of the tree.
00046 Is templated for the key member.
00047 */
00048 template <class T>
00049 class RIDKeyPair{
00050 public:
00051   RecordId rid;
00052   T key;
00053   void set( RecordId r, T k)
00054   {
00055     rid = r;
00056     key = k;
00057   }
00058 };
00059 
00060 /*
00061 Structure to store a key page pair which is used to pass the key and page to functions that make 
00062 any modifications to the non leaf pages of the tree.
00063 */
00064 template <class T>
00065 class PageKeyPair{
00066 public:
00067   PageId pageNo;
00068   T key;
00069   void set( int p, T k)
00070   {
00071     pageNo = p;
00072     key = k;
00073   }
00074 };
00075 
00076 /*
00077 Overloaded operator to compare the key values of two rid-key pairs
00078 and if they are the same compares to see if the first pair has
00079 a smaller rid.pageNo value.
00080 */
00081 template <class T>
00082 bool operator<( const RIDKeyPair<T>& r1, const RIDKeyPair<T>& r2 )
00083 {
00084   if( r1.key != r2.key )
00085     return r1.key < r2.key;
00086   else
00087     return r1.rid.page_number < r2.rid.page_number;
00088 }
00089 
00090 /*
00091 The meta page is always page 1 of the btree index file and is cast
00092 to the following structure to store or retrieve information from it.
00093 Contains the relation name for which the index is created, the byte offset
00094 of the key value on which the index is made, the type of the key and the page no
00095 of the root page. Root page starts as page 2 but since a split can occur
00096 at the root the root page may get moved up and get a new page no.
00097 */
00098 struct IndexMetaInfo{
00099   char relationName[20];
00100   int attrByteOffset;
00101   Datatype attrType;
00102   PageId rootPageNo;
00103 };
00104 
00105 /*
00106 Each node is a page, so once we read the page in we just cast
00107 the pointer to the page to this struct and use it to access the parts
00108 These structures basically are the format in which the information
00109 is stored in the pages for the index file depending on what kind of 
00110 node they are.
00111 The level memeber of each non leaf structure seen below is set to 1 if the
00112 nodes at this level are just above the leaf nodes. Otherwise set to 0.
00113 */
00114 
00115 /*
00116 Structure for all non leaf nodes that have the key value as int.
00117 Used to place information into a leaf node and then retrieve it 
00118 as well. 
00119 */
00120 struct NonLeafNodeInt{
00121   int level;  
00122   int keyArray[ INTARRAYNONLEAFSIZE ];
00123   PageId pageNoArray[ INTARRAYNONLEAFSIZE + 1 ];
00124 };
00125 
00126 /*
00127 Structure for all non leaf nodes that have the key value as double.
00128 Used to place information into a leaf node and then retrieve it 
00129 as well.
00130 */
00131 struct NonLeafNodeDouble{
00132   int level;
00133   double keyArray[ DOUBLEARRAYNONLEAFSIZE ];
00134   PageId pageNoArray[ DOUBLEARRAYNONLEAFSIZE + 1 ];
00135 };
00136 
00137 /*
00138 Structure for all non leaf nodes that have the key value as string.
00139 Used to place information into a leaf node and then retrieve it 
00140 as well.
00141 */
00142 struct NonLeafNodeString{
00143 
00144   int level;
00145   char keyArray[ STRINGARRAYNONLEAFSIZE ][ STRINGSIZE ];
00146   PageId pageNoArray[ STRINGARRAYNONLEAFSIZE + 1 ];
00147 };
00148 
00149 /*
00150 Structure for all leaf nodes that have the key value as int.
00151 Used to place information into a leaf node and then retrieve it 
00152 as well. 
00153 */
00154 struct LeafNodeInt{
00155   int keyArray[ INTARRAYLEAFSIZE ];
00156   RecordId ridArray[ INTARRAYLEAFSIZE ];
00157   PageId rightSibPageNo;
00158 };
00159 
00160 /*
00161 Structure for all leaf nodes that have the key value as double.
00162 Used to place information into a leaf node and then retrieve it 
00163 as well. 
00164 */
00165 struct LeafNodeDouble{
00166   double keyArray[ DOUBLEARRAYLEAFSIZE ];
00167   RecordId ridArray[ DOUBLEARRAYLEAFSIZE ];
00168   PageId rightSibPageNo;
00169 };
00170 
00171 /*
00172 Structure for all leaf nodes that have the key value as string.
00173 Used to place information into a leaf node and then retrieve it 
00174 as well. 
00175 */
00176 struct LeafNodeString{
00177   char keyArray[ STRINGARRAYLEAFSIZE ][ STRINGSIZE ];
00178   RecordId ridArray[ STRINGARRAYLEAFSIZE ];
00179   PageId rightSibPageNo;
00180 };
00181 
00182 // -----------------------------------------------------------------------------
00183 // BTreeIndex
00184 //   A class which implements a B+ Tree index on a single attribute of one 
00185 //   relation.  This index supports only one scan at a time.
00186 // -----------------------------------------------------------------------------
00187 class BTreeIndex {
00188 
00189 private:
00190 
00191   File        *file;           // file object for this index
00192   BufMgr      *bufMgr;        //buffer manager instance
00193   PageId      headerPageNum;  // number of header page in file
00194   PageId      rootPageNum;    // page number of root page
00195   Datatype    attributeType;  
00196   int         leafOccupancy;  // maximum size of leaf node
00197   int         nodeOccupancy;  // maximum size of non-leaf node
00198 
00199 // Members specific to scanning
00200 
00201   bool        scanExecuting;          // is an index scan currently executing?
00202   int         nextEntry;              // offset of the next entry to be scanned
00203   PageId      currentPageNum;         // page number of pinned page
00204   Page*       currentPageData;
00205   int         lowValInt;        //variable that define the range based on which
00206   double      lowValDouble;     //type of index is being used for the scan
00207   std::string lowValString;
00208   int         highValInt;
00209   double      highValDouble;
00210   std::string highValString;
00211   
00212   Operator    lowOp;                  // GT (>) or GTE (>=)
00213   char        highVal[MAXKEYSIZE];    // comparison value of filter
00214   Operator    highOp;                 // LT (<) or LTE (<=)
00215 
00216   //other meta info read in from page 1 of the file 
00217   int attrByteOffset;
00218 
00219   //constructor helper functions//
00220 /*
00221 This function is called by the actual btreeindex constructor. It reads the
00222 data from the actual heap file using HeapFileScan, stores the key rid 
00223 pairs into a deque and then sorts it. After sorting the data it calls 
00224 the fillLeafNodes to create all the required leaf nodes of the index and
00225 then calls the fillNonLeaf nodes till the it reached the point at which
00226 it has created the root 
00227 It also creates the meta page and stores all the required information for the
00228 index being created.
00229 Uses BULK LOADING ALGORITHM 
00230 Parameters- 
00231   relationName : name of relation for which tree is being created
00232         outIndexName : name of index file that is being created
00233   attrByteOffset: offset of record that is the key for the index
00234   attrType: attribute type
00235   
00236   status: to report any errors is set and can be used by calling function
00237 */  
00238   template <class T> 
00239   void BTreeTemplateConstructor(const std::string & relationName,
00240       std::string & outIndexName,
00241       const int attrByteOffset,
00242       const Datatype attrType);
00243   
00244 /*
00245 This function is called by the function above, it basically goes through the
00246 dataList deque which contains all the data entries and starts
00247 placing them in leafNodes. It allocates space for nodes and fills them up with the
00248 data entries. While doing so it fills the the thisLevelNodes deque placing
00249 the first data entry that was into each leaf node being created.
00250 The nodes are filled up based on the number of entries we have calculated
00251 each node as being able to contain. We only fill the nodes upto half the
00252 capacity calculated.
00253 Parameters-
00254   thisLevelNodes: is set by this function to tell the calling function
00255       which values need to placed in the levels above
00256       for the balanced tree to be created
00257   datalist: all the key values and rids for the records to be added to the leaf level
00258 Return Value: returns last page number assigned
00259 */
00260   template <class T>
00261   int fillLeafNodes( 
00262     std::deque< PageKeyPair< T > > &thisLevelNodes,
00263     std::deque< RIDKeyPair< T > > &dataList);
00264   
00265 /*
00266 Once the leaf nodes are created, this function is called in a loop. It takes each
00267 entry in the thisLevelNodes, and places it into a nonLeaf node. Once the nonLeafnode
00268 is half full, it places the next entry into the lastLevelNodes deque, creates another 
00269 nonLead node and starts filling it up until its half full and keeps going on like this.
00270 After it has been through the whole thisLevelNodes deque it set the thisLevelNodes deque
00271 equal to the lastLevelNodes deque so that the next time fillNonLeafNodes is called in
00272 the while loop of the bulk loading algorithm it uses that as we are going to be creating
00273 the level right above the one just created.
00274 Parameters-
00275   thisLevelNodes- indicates all the entries that were not placed
00276       in this level but will be placed into levels above
00277   lastLevelNodes- indicates all entries that go into this level or levels 
00278       above
00279   level- indicates the level being populated
00280   status - set by the function and can be read by the calling function
00281   
00282 */
00283   template <class T>
00284   int fillNonLeafNodes( 
00285     std::deque< PageKeyPair< T > > &thisLevelNodes,
00286     std::deque< PageKeyPair< T > > &lastLevelNodes,
00287     const int level);
00288   
00289 /*
00290 This functions helps copy the key and rid from a record into a RIDKeyPair object 
00291 based on the type of the data being used for the current index.
00292 Parameters- 
00293   rec: record whose key pair is to be extracted
00294   rid: the rid for this record
00295   attrByteOffset: the offset of the key value in the record
00296 Return Value: returns the ridKey pair
00297 */
00298   template <class T>
00299   RIDKeyPair<T> copyRecordToRidKeyPair( std::string rec, RecordId rid, const int attrByteOffset );
00300 
00301   //Scan Helper Functions//
00302 /*
00303 Starts a scan for the user. If a previous scan is still running, ends it and starts the
00304 new scan. It first checks to make sure that the op values used are allowed and correct. It 
00305 then goes down the index tree looking for the lower bound specified and finds the leaf node
00306 page it should be in. Then the functions parses the node till entries are smaller than the lower
00307 bound specified. If this value is not found in the page, we know the lowest value for this scan is the first
00308 value of the next page. Next, the range parameters are checked based on which the we set the nextEntry value.
00309 The nextentry values is moved up using the scanNext function.
00310 Return Value: returns the status 
00311 */
00312   template <class T>
00313   const void templateStartScan();
00314 
00315 /*
00316 Basically checks the next entry in the leaf Node we are at and if we are at the end of the node
00317 checks the right sibling pointer and sets the next entry to the first entry in that page. It also
00318 checks to make sure we are still in the range specified by scanStart. 
00319 */
00320   template <class T>
00321   const void templateScanNext(RecordId& outRid);  // returned record id
00322 
00323   //Insert Helper Functions// 
00324 /*
00325 called by the insertEntry function based on the type of tree being inserted into setting
00326 the appropriate size parameters and is always sent the root page number the first time.
00327 It starts at the root and goes down the tree along the path based on the value being inserted. 
00328 Once it gets to the right leaf node page, if it can insert it there it does so, and if it is full
00329 it splits it and places the keyrid pair into either the old page that was halved or the new page
00330 depending on where it belongs and then this split is propogated back up each level. When it
00331 returns from the insertInto call that inserted the keyrid into the leaf it takes the first entry
00332 of the new page that was created because of the split and places it into the non leaf node. And again
00333 does a split if needed. This goes all the way upto the root. If the root is split, we the meta data
00334 is changed and this function handles that by writing the new information to the metapage. 
00335 Parameters-
00336   pageNo: next page number of the path being followed along the tree for insert
00337   key: key value being inserted
00338   rid: rid value for the record whose key is being inserted
00339   treePath: stack to indicated what the last level that was inserted is
00340 Return Value- returns the status after insert
00341 */
00342   template <class T>
00343   void insertInto(PageId pageNo,  T key, PageKeyPair<T> &newPageKeyPair, 
00344          RecordId &rid, std::stack<int> &treePath);
00345         
00346 /*
00347 Finds the next page in the path down the index based on the key value we are trying to look 
00348 for in the leaf nodes.  
00349 Parameters-
00350   currentNode: pointer to the page being looked at to determing the next page in the path
00351 Return Value- pageNo for the path to be followed down the tree
00352 */
00353   template <class T>
00354   PageId findPageNoInNonLeaf( Page *currentNode, T key );
00355   
00356 /*
00357 Called to place a key page pair into a non leaf node if its not full. First checks to see
00358 where it should go based on the sort order, then shifts everything over one and places
00359 the key page pair into the appropriate position.
00360 Parameters-
00361   currentNode: page into which the keypage pair is being inserted
00362   keyPagePair: key and page that need to be inserted
00363   lastIndexUsed:index of last valid entry in the page
00364 */
00365   template <class T>
00366   void insertPageKeyPair( Page* currentNode, PageKeyPair<T> keyPagePair, int lastIndexUsed );
00367   
00368 /*
00369 Same as above except for the fact that it is called for leaf pages instead and places
00370 the key rid pair into the node
00371 Parameters-
00372   currentNode:node into which the key rid need to be inserted
00373   RIDKeyPair: pair of key and rid that need to be inserted into the node
00374   lastIndexUsed: last index that is valid in the node
00375 */
00376   template <class T>
00377   void insertKeyRidPair( Page* currentNode, RIDKeyPair<T> keyRidPair, int lastIndexUsed );
00378   
00379 /*
00380 If a non leaf node is full, this function is called. It keeps the first half in the original
00381 node and takes the rest of the values and places it into the newly created node while zeroing
00382 out the entries from the original node and then also makes sure it zeros out the entries in the new
00383 nodes that are not used.  
00384 Parameters-
00385   currentNode:node that needs to be split
00386   newNode:page ptr to the new node into which half the current node values will be placed
00387 */
00388   template <class T>
00389   void splitNonLeafNode(Page* currentNode, Page* newNode);
00390   
00391 /*
00392 Same as function above for the leaf nodes.
00393 Parameters-
00394   currentNode: node that needs to split
00395   newLeafNode: node ptr to place the second half of currentNode into
00396   newNodeIndexNotUsed:first index that is invalid in the node 
00397 */
00398   template <class T>
00399   void splitLeafNode(Page* currentNode, Page* newLeafNode, int &newNodeIndexNotUsed);
00400   
00401 /*
00402 Indicates whether the non leaf node is full and if its not tells the calling function what the index is
00403 of the last entry that has valid data. We check the page no and if it is 0 that means the data is 
00404 valid upto the page entry before that.
00405 Parameters-
00406   curentNode: node being checked for free space
00407   lastIndexUsed: index of last valid entry in the node
00408 */
00409   void isNonLeafFull( Page* currentNode, int &lastIndexUsed);
00410   
00411 /*
00412 Same as function above for leaf nodes.  
00413 Parameters-
00414   currentNode:node being checked for free space
00415   lastIndexUsed: index of last valid entry of the node
00416 */
00417   void isLeafFull( Page* currentNode,  int &lastIndexUsed);
00418   
00419 /*
00420 Function to set the right high and low val members of the class object based on the type
00421 of index being scanned  
00422 Parameters-
00423   val- value to set the high/low val variable to
00424   high- if true set the high val, else set the low val
00425 */
00426   template <class T>
00427   void setHighLowVal(const void* val, bool high);
00428   
00429 /*
00430 Function to get the righr high and low val members of the class object based on the type of the
00431 index being scanned.  
00432 Parameters-
00433   high:if true get the high val, else get the low val
00434 */
00435   template <class T>
00436   T getHighLowVal(bool high);
00437   
00438 /*
00439 Function to copy the key value to the appropriate index of a non leaf or leaf node.
00440 It takes the node and casts it to the right type of structure so that we can get
00441 access to the right part of the node that we want to write to.
00442 Parameters-
00443   key:key to copy into the index of the node
00444   nodePagePtr:node into which key needs to be copied
00445   index: index into which the key needs to be copied
00446   leaf:whether the node being inserted into is a leaf node or not
00447 */
00448   template <class T>
00449   void copyKey(T key, Page *nodePagePtr , int index, bool leaf);
00450 
00451 /*
00452 Gets the key values stored at the index specified for a leaf or non leaf node.  
00453 Return Value- returns the key value extracted
00454 */
00455   template <class T>
00456   T extractKey(T &value, Page *nodePagePtr , int index, bool leaf);
00457 
00458 /*
00459 Gets the page no from the appropriate index of a non leaf page. 
00460 Parameters-
00461   pageNo-value of the pageNo that is at the index specified in the node
00462   nodePagePtr:node from which the pageNo needs to be extracted
00463   index:index of entry being extracted
00464 Return Value- returns the pageNo extracted too
00465 */
00466   PageId extractPageNo(PageId &pageNo, Page *nodePagePtr, int index);
00467   
00468 /*
00469 Copies the page no into the appropriate index of a non leaf page. 
00470 Parameters-
00471   pageNo:pageno to be placed in the node
00472   nodePagePtr:node into which the pageno needs to be inserted
00473   index: index at which the pageno needs to be inserted
00474 */
00475   void copyPageNo(PageId pageNo, Page *nodePagePtr, int index);
00476 
00477 /*
00478 Extracts the level from the non leaf page.  
00479 Parameters-
00480   pageNo:set to the level of the node
00481   nodePagePtr:node whose level is needed
00482 Return Value-returns the level of the node
00483 */
00484   int extractLevel(int &level, Page *nodePagePtr);
00485   
00486 /*
00487 Sets the level value for the non leaf page.
00488 */
00489   void copyLevel( int level, Page *nodePagePtr );
00490   
00491 /*
00492 Sets the rid of the appropriate index of the leaf node.
00493 */
00494   void copyRid(RecordId rid, Page *nodePagePtr, int index);
00495   
00496 /*
00497 Extracts the rid from the appropriateindex of a leaf page.  
00498 */
00499   RecordId extractRid(RecordId &rid, Page *nodePagePtr, int index);
00500   
00501 /*
00502 Set the right sib ptr of a leaf node.
00503 */
00504   void copySibPtr(PageId pageNo, Page *nodePagePtr);
00505   
00506 /*
00507 Get the right sib ptr of a leaf node.
00508 */
00509   PageId extractSibPtr(PageId &pageNo, Page *nodePagePtr);
00510   
00511 /*
00512 Zero out everything in the node.
00513 */
00514   void zeroOutNonLeaf(Page *currentNode);
00515 
00516 /*
00517 Zero out everything in the node.
00518 */  
00519 void zeroOutLeaf(Page *currentNode);
00520   
00521 /*
00522 Debug help function to print the state of the tree at any given point.
00523 */
00524   template <class T>
00525   void printTree(PageId pageNum , bool isLeaf);
00526   
00527 public:
00528 
00529 // ---------------------------------------------------------------------------
00530 // Public interface methods
00531 // ---------------------------------------------------------------------------
00532 
00533 // Check to see if the specified index file exists. If so, open the file.
00534 // If not, create it and insert an entry for every tuple in the base relation
00535 // Keep the header page pinned in memory.
00536 
00537   BTreeIndex(const std::string & relationName,      // name of relation file to index
00538     std::string & outIndexName,            // name of index file
00539     BufMgr *bufMgrIn,                     //buffer manager instance
00540     const int attrByteOffset,         // offset to attrtibute in tuple
00541     const Datatype attrType);
00542   
00543 
00544 // Unpin the header page and close the file.  (do not delete the index file)
00545 
00546   ~BTreeIndex();
00547 
00548 // Insert a new entry using the pair <value,rid>.  Check for unique values.
00549 
00550   const void insertEntry(const void* value,  // value to insert
00551       const RecordId rid);     // corresponding tuple rid
00552 
00553 // Begin a filtered scan of the index.  For example, if the method is called 
00554 // using ("a",GT,"d",LTE) then we should seek all entries with a value 
00555 // greater than "a" and less than or equal to "d".
00556 
00557   const void startScan(const void* lowVal,     // low value of range
00558         const Operator lowOp,   // operation (GT, GTE)
00559         const void* highVal,    // high value of range
00560         const Operator highOp); // operation (LT, LTE)
00561 
00562 // Fetch the record id of the next index entry that matches the scan.
00563 
00564   const void scanNext(RecordId& outRid);  // returned record id
00565 
00566 // Terminate the current scan
00567 
00568   const void endScan();
00569   
00570 /*
00571 Public function to allow for a public interface to print the index B+-Tree
00572 Useful for testindex
00573 */
00574   void printTreeExternal();
00575 
00576 };
00577 
00578 }
 All Classes Namespaces Functions Variables Typedefs Enumerations Friends