BadgerDB: /afs/cs.wisc.edu/u/n/w/nwilliam/private/workspace/Quut/src/index.cpp Source File

00001 
00008 #include <sstream>
00009 #include <cstring>
00010 #include <math.h>
00011 #include "index.h"
00012 #include "filescan.h"
00013 #include "exceptions/bad_param_exception.h"
00014 #include "exceptions/duplicate_attribute_exception.h"
00015 #include "exceptions/no_more_records_exception.h"
00016 #include "exceptions/end_of_file_exception.h"
00017 #include "exceptions/file_not_found_exception.h"
00018 #include "exceptions/record_not_found_exception.h"
00019 #include "exceptions/duplicate_insertion_exception.h"
00020 #include "exceptions/directory_overflow_exception.h"
00021 
00022 #define MAX(a,b) ((a) > (b) ? (a) : (b))
00023 
00024 namespace badgerdb {
00025 
00026 // Constructor for the Index class. The arguments passed in are 
00027 //     'name' -- file name of the relation
00028 //     'offset', 'length', 'type' -- describing the attribute indexed
00029 //     'unique' -- flag for enforcing uniquess of the entries
00030 
00031 Index::Index(const std::string &name, int offset, int length, Datatype type, int unique, BufMgr *bufferMgr)
00032 {
00033   bufMgr = bufferMgr;
00034   Page *pagePtr;
00035 
00036   // For this assignment turn off hash indices on strings
00037   if(type == STRING) 
00038   {
00039     throw BadParamException();
00040   }
00041    
00042   curBuc = NULL;
00043 
00044   recSize = length + sizeof(RecordId);     // size of the index entry
00045   numSlots = (Page::SIZE - 2*sizeof(short)) / recSize;   // # entries on a page
00046 
00047   // get name of the index file by concatenating relation name and
00048   // the offset of the attribute
00049 
00050   std::ostringstream outputString;
00051   outputString << name << '.' << offset << std::endl;
00052   std::string indexName = outputString.str(); 
00053 
00054   // The constructor runs in two cases. In the first case, an index 
00055   // doesn't exist as detected by db.openFile. The relation file 
00056   // should be scanned and each tuple is inserted to the index. In
00057   // the second case, an index already exists. Only the headerPage
00058   // needs to read in and pinned down in the buffer pool.
00059 
00060   try
00061   {
00062     file = new BlobFile(indexName, false/*open*/);  //open file
00063 
00064     headerPageNo = 1;
00065     bufMgr->readPage(file, headerPageNo, pagePtr);
00066     headerPage = (iHeaderPage*) pagePtr;
00067     dirSize = (int)pow(2, headerPage->depth);
00068     return;
00069   }
00070   catch(FileNotFoundException e)
00071   {
00072   }
00073 
00074   file = new BlobFile(indexName, true/*create*/); //create file
00075 
00076   // allocate and initialize the header page
00077   bufMgr->allocPage(file, headerPageNo, (Page*&)headerPage);
00078   strcpy(headerPage->fileName, name.c_str());
00079 
00080   headerPage->offset = offset;
00081   headerPage->length = length;
00082   headerPage->type = type;
00083   headerPage->depth = 0;
00084   headerPage->unique = unique;
00085   dirSize = 1;
00086 
00087   // allocate and initialize the first bucket pointed by dir[0]
00088   PageId pageNo;
00089   Bucket* bucket;
00090   bufMgr->allocPage(file, pageNo, (Page*&)bucket);
00091 
00092   bucket->depth = 0;
00093   bucket->slotCnt = 0;
00094   headerPage->dir[0] = pageNo;
00095 
00096   bufMgr->unPinPage(file, pageNo, true);
00097 
00098   // build index by scanning the relation file and inserting every
00099   // tuple
00100 
00101   FileScan fileScan(name, bufferMgr);
00102 
00103   fileScan.startScan(offset, length, type, std::string(""), EQ);
00104 
00105   RecordId   rid;
00106 
00107   while(1)
00108   {
00109     try
00110     {
00111       fileScan.scanNext(rid);
00112       std::string recordString = fileScan.getRecord();
00113       insertEntry(recordString.c_str() + offset, rid);
00114     }
00115     catch(EndOfFileException &e)
00116     {
00117       break;
00118     }
00119   }
00120 }
00121 
00122 Index::~Index() {
00123 
00124   bufMgr->unPinPage(file, headerPageNo, true);
00125 
00126   if (curBuc != NULL)
00127   {
00128     bufMgr->unPinPage(file, curPageNo, true);
00129   }
00130   
00131   bufMgr->flushFile(file);
00132 
00133   delete file;
00134 }
00135 
00136 // return a hashvalue hashed from the attribute 'value' according to 
00137 // the current depth of the directory
00138 
00139 void Index::hashIndex(const void *attr, int &hashvalue)
00140 {
00141   int index;
00142   const char *value = (const char *)attr;
00143 
00144   switch (headerPage->type)
00145   {
00146     case STRING:
00147       {
00148         // For strings "add" the values of the characters, stopping 
00149         // at the first character that is 0
00150         index = value[0];
00151         for (int i = 1; i < headerPage->length; i++)
00152         {
00153           if (!value[i]) 
00154             index += value[i];
00155           else
00156             break;
00157         }
00158       }
00159       break;
00160 
00161     case DOUBLE:
00162       {
00163         // Treat doubles as strings of length sizeof (double)
00164         index = value[0];
00165         for (int i = 1; i < headerPage->length; i++) index += value[i];
00166       }
00167       break;
00168 
00169     case INTEGER:
00170       // copy the integer value to avoid byte-alignment errors.
00171       memcpy (&index, value, sizeof(int));
00172       break;
00173   }
00174 
00175   // hashvalue is the lower bits of 'index'. Number of bits is determined
00176   // by the depth of the directory
00177 
00178   index = std::abs(index);   
00179   int mask = 1 << headerPage->depth;
00180   hashvalue = index % mask;
00181 }
00182 
00183 // Insert an <attribute, rid> pair into the index. Return OK if the 
00184 // entry is inserted and DIROVERFLOW if the directory isn't large
00185 // enough to hold the indices.
00186 
00187 void Index::insertEntry(const void *value, RecordId rid) 
00188 { 
00189   Bucket *bucket;
00190   Bucket *newBucket;
00191   PageId newPageNo;       
00192   char  data[Page::SIZE*2];
00193   int counter;
00194   int index;
00195 
00196   // If the 'unique' flag is set, scan the index to see if the
00197   // <attribute, rid> pair already exists
00198 
00199   if (headerPage->unique == UNIQUE)
00200   {
00201     RecordId outRid;
00202 
00203     startScan(value);
00204 
00205     while(1)
00206     {
00207       try
00208       {
00209         scanNext(outRid);
00210         if(!memcmp(&outRid, &rid, sizeof(RecordId)))
00211           throw DuplicateInsertionException();
00212       }
00213       catch (NoMoreRecordsException &e)
00214       {
00215         break;
00216       }
00217     }
00218     
00219     endScan();
00220   }
00221  
00222   // Get the bucket containing the entry into buffer pool
00223   hashIndex(value, index);
00224   PageId pageNo = headerPage->dir[index];
00225 
00226 #ifdef DEBUGIND
00227   std::cout << "Inserting entry " << *(int*)value << " to bucket " 
00228     << pageNo << std::endl;
00229 #endif //DEBUGIND
00230  
00231   bufMgr->readPage(file, pageNo, (Page*&)bucket);
00232 
00233   // the bucket needs to be splitted if the number of entries
00234   // on the bucket equals the maximum
00235   if (bucket->slotCnt == numSlots) { // splitting bucket
00236     
00237     // allocate a new bucket
00238     bufMgr->allocPage(file, newPageNo, (Page*&)newBucket);  
00239 
00240     // Initialize this newly allocated bucket
00241     newBucket->depth = ++(bucket->depth);
00242     newBucket->slotCnt = 0;
00243 
00244     // Copy all (value, rid) pairs in the old bucket and the new
00245     // entry to a temporary area
00246     memcpy(data, bucket->data, numSlots*recSize);
00247     memcpy(&(data[numSlots*recSize]), value, headerPage->length);
00248     memcpy(&(data[numSlots*recSize + headerPage->length]), &rid, sizeof(RecordId));
00249 
00250     counter = bucket->slotCnt + 1;
00251     bucket->slotCnt = 0;
00252 
00253     // the directory needs to be doubled if the depth of the bucket
00254     // being splitted equals the depth of the directory
00255 
00256     if (bucket->depth > headerPage->depth)
00257     { // doubling directory
00258 
00259       // The directory is doubled and the lower half of the directory
00260       // is copied to the upper half
00261 
00262       int newDirSize = 2 * dirSize;
00263       if (newDirSize > DIRSIZE)
00264         throw DirectoryOverflowException();
00265 
00266       for(int i = 0; i < dirSize; i++)
00267       {
00268         headerPage->dir[i + dirSize] = headerPage->dir[i];
00269       }
00270 
00271       dirSize = newDirSize;
00272       (headerPage->depth)++;
00273       headerPage->dir[index + (1 << (bucket->depth - 1))] = newPageNo;
00274     }
00275     else
00276     {
00277       // reset the appropriate directories to the new bucket
00278       int oldindex = index % (1 << (bucket->depth - 1));
00279       int newindex = oldindex + (1 << (bucket->depth - 1));
00280       for (int j = 0; j < dirSize; j++)
00281       {
00282         if ((j % (1 << (bucket->depth))) == newindex)
00283         {
00284           headerPage->dir[j] = newPageNo;
00285         }
00286       }
00287     }
00288 
00289 #ifdef DEBUGIND
00290     printDir();
00291 #endif //DEBUGIND
00292 
00293     // call insertEntry recursively to insert all (value, rid) 
00294     // pairs in the temporary area to the index
00295 
00296     for (int k = 0; k < counter; k++) {
00297       value = &(data[k * recSize]);
00298       rid = * ((RecordId *)((char *)value + headerPage->length));
00299       insertEntry(value, rid);
00300     }
00301 
00302     bufMgr->unPinPage(file, newPageNo, true);
00303 
00304   }
00305   else
00306   {
00307     // There is sufficient free space in the bucket. Insert (value, rid) here
00308 
00309     int offset = (bucket->slotCnt) * recSize;
00310     memcpy(&(bucket->data[offset]), value, headerPage->length);
00311     memcpy(&(bucket->data[offset+headerPage->length]), &rid, sizeof(RecordId));
00312     (bucket->slotCnt)++;
00313   }
00314 
00315   bufMgr->unPinPage(file, pageNo, true);
00316 }
00317 
00318 void Index::printDir() {
00319   std::cout << "printing directory...\n";
00320   std::cout << "depth is " << headerPage->depth << std::endl;
00321   for (int i = 0; i < dirSize; i++) 
00322     std::cout << i << "\tpoints to bucket " << headerPage->dir[i] << std::endl;
00323   std::cout << std::endl;
00324 }
00325 
00326 // delete <attribute, rid> pair from the index. return OK if deleted
00327 // return RECNOTFOUND if such a pair does not exist in the index
00328 
00329 void Index::deleteEntry(const void* value, const RecordId &rid)
00330 {
00331   int index;
00332   Bucket* bucket;
00333 
00334   hashIndex(value, index);
00335 
00336   // read in the bucket that might have the entry in it
00337   PageId pageNo = headerPage->dir[index];
00338   bufMgr->readPage(file, pageNo, (Page*&)bucket);
00339 
00340   // scan the bucket for the entry. Delete it if found
00341   for(int i = 0; i < bucket->slotCnt; i++)
00342   {
00343     try
00344     {
00345       matchRec(bucket, value, i);
00346       if(!memcmp(&rid, &(bucket->data[i*recSize + headerPage->length]), sizeof(RecordId)))
00347       {
00348         // the entry is found. Decrease the entry counts in the bucket
00349         // and copy the last entry in the bucket to the slot occupied
00350         // by the deleted entry
00351 
00352         (bucket->slotCnt)--;
00353         memcpy(&(bucket->data[i*recSize]), &(bucket->data[recSize*(bucket->slotCnt)]), recSize);
00354         bufMgr->unPinPage(file, pageNo, true);
00355         return;
00356       }
00357     }
00358     catch(RecordNotFoundException &e)
00359     {
00360     }
00361   }
00362 
00363   bufMgr->unPinPage(file, pageNo, false);
00364 
00365   throw RecordNotFoundException();
00366 }
00367 
00368 // Test if the index entry in 'bucket' matches the attribute 'value.
00369 // 'offset' indicates the location of the entry in the bucket
00370 
00371 void Index::matchRec(const Bucket* bucket, const void* value, int offset) 
00372 {
00373   const char* tmp = &(bucket->data[offset*recSize]);
00374 
00375   switch(headerPage->type)
00376   {
00377   case INTEGER:
00378     if (*(int *)value == *(int *)tmp)
00379     //if (atoi((char *)value) == *(int *)tmp)
00380       return;
00381     break;
00382 
00383   case DOUBLE:
00384     {
00385        // Take care of byte misallignments 
00386        double inVal, inTmp;
00387        memcpy (&inVal, value, sizeof (double));
00388        memcpy (&inTmp, tmp, sizeof (double));
00389        // if (*(double *)value == *(double*)tmp) return OK;
00390        if (inVal == inTmp) return;
00391        //if (atof((char *)value) == inTmp) return;
00392     }
00393     break;
00394 
00395   case STRING:
00396     if (!memcmp(value, tmp, headerPage->length))
00397       return;
00398     break;
00399   }
00400 
00401   throw RecordNotFoundException();
00402 }
00403 
00404 void Index::printBucs()
00405 {
00406 
00407   Bucket* bucket;
00408 
00409   std::cerr << std::endl << "printing indices"<< std::endl;
00410 
00411   for (int i = 0; i < dirSize; i++)
00412   {
00413     PageId pageNo = headerPage->dir[i];
00414     std::cerr << "page " << pageNo << ": ";
00415 
00416     bufMgr->readPage(file, pageNo, (Page*&)bucket);
00417     for (int j = 0; j < bucket->slotCnt; j++)
00418     {
00419       switch(headerPage->type)
00420       {
00421         case INTEGER:
00422           std::cerr << *(int*)&(bucket->data[j*recSize]) << "\t";
00423           break;
00424         case DOUBLE:
00425           std::cerr << *(double*)&(bucket->data[j*recSize]) << "\t";
00426           break;
00427         default:break;
00428       }
00429     }
00430     std::cerr << std::endl;
00431     bufMgr->unPinPage(file, pageNo, false);
00432   }
00433 }
00434 
00435 // start a scan of the entries with attribute 'value'. return SCANTABFULL
00436 // if too many scans are open at the same time
00437 
00438 void Index::startScan(const void* scanValue)
00439 {
00440   int hashvalue;
00441 
00442   curValue = scanValue;
00443 
00444   // read in and pin down the bucket containing entries with 'value'
00445   // in the buffer pool
00446   hashIndex(curValue, hashvalue);
00447 
00448   PageId pageNo = curPageNo = headerPage->dir[hashvalue];
00449   bufMgr->readPage(file, pageNo, (Page*&)curBuc);
00450 
00451   curOffset = 0;
00452 }
00453 
00454 // return the next entry with attribute 'value'. return NOMORERECS if
00455 // it reaches the end of the bucket
00456 
00457 void Index::scanNext(RecordId& outRid) 
00458 {
00459   int &offset = curOffset;
00460   Bucket* buc = curBuc;
00461   const void* value = curValue;
00462 
00463   while(offset < buc->slotCnt)
00464   {
00465     try
00466     {
00467       matchRec(buc, value, offset);
00468       break;
00469     }
00470     catch(RecordNotFoundException &e)
00471     {
00472     }
00473 
00474     offset++;
00475   }
00476 
00477   if (offset == buc->slotCnt)
00478     throw NoMoreRecordsException();
00479   else
00480   {
00481     outRid = *(RecordId *)&(buc->data[offset*recSize + headerPage->length]);
00482     offset++;
00483   }
00484 }
00485 
00486 // unpin the bucket and clear the scan table entry.
00487 void Index::endScan() 
00488 {
00489   if (curBuc != NULL)
00490   {
00491     bufMgr->unPinPage(file, curPageNo, false);
00492     curBuc = NULL;
00493   }
00494 }
00495 
00496 /*
00497 // find a free entry in the scan table
00498 int Index::findFree()
00499 {
00500   int i;
00501   for(i = 0; i < MAXOPENSCANS; i++)
00502   {
00503     if (openScans[i].valid == false) return i;
00504   }
00505   return -1;
00506 }
00507 */
00508 
00509 }