mem_block.cpp

/* -*- mode:C++; c-basic-offset:4 -*-
     Shore-MT -- Multi-threaded port of the SHORE storage manager
   
                       Copyright (c) 2007-2009
      Data Intensive Applications and Systems Laboratory (DIAS)
               Ecole Polytechnique Federale de Lausanne
   
                         All Rights Reserved.
   
   Permission to use, copy, modify and distribute this software and
   its documentation is hereby granted, provided that both the
   copyright notice and this permission notice appear in all copies of
   the software, derivative works or modified versions, and any
   portions thereof, and that both notices appear in supporting
   documentation.
   
   This code is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. THE AUTHORS
   DISCLAIM ANY LIABILITY OF ANY KIND FOR ANY DAMAGES WHATSOEVER
   RESULTING FROM THE USE OF THIS SOFTWARE.
*/
/*<std-header orig-src='shore' incl-file-exclusion='MEM_BLOCK_CPP'>

 $Id: mem_block.cpp,v 1.3 2010/07/07 21:43:45 nhall Exp $

SHORE -- Scalable Heterogeneous Object REpository

Copyright (c) 1994-99 Computer Sciences Department, University of
                      Wisconsin -- Madison
All Rights Reserved.

Permission to use, copy, modify and distribute this software and its
documentation is hereby granted, provided that both the copyright
notice and this permission notice appear in all copies of the
software, derivative works or modified versions, and any portions
thereof, and that both notices appear in supporting documentation.

THE AUTHORS AND THE COMPUTER SCIENCES DEPARTMENT OF THE UNIVERSITY
OF WISCONSIN - MADISON ALLOW FREE USE OF THIS SOFTWARE IN ITS
"AS IS" CONDITION, AND THEY DISCLAIM ANY LIABILITY OF ANY KIND
FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.

This software was developed with support by the Advanced Research
Project Agency, ARPA order number 018 (formerly 8230), monitored by
the U.S. Army Research Laboratory under contract DAAB07-91-C-Q518.
Further funding for this work was provided by DARPA through
Rome Research Laboratory Contract No. F30602-97-2-0247.

*/

/**\cond skip */

#include "w.h"
#include "mem_block.h"
#include "atomic_ops.h"
#include <cstdlib>
#include <stdio.h>
#include <algorithm>
#ifdef __linux
#include <malloc.h>
#endif

// #include <cassert>
#undef assert
void assert_failed(const char *desc, const char *f, int l) {
    fprintf(stdout, "Assertion failed: %s at line %d file %s\n", desc, l, f);
    w_assert0(0);
}
#define assert(x) do { if (!(x)) assert_failed(#x, __FILE__, __LINE__); } while (0)

// shorthand for the (chip_size, chip_count, block_size) triple that
// nearly every method in this file forwards along
#define TEMPLATE_ARGS chip_size, chip_count, block_size

namespace memory_block {
#if 0
} // keep emacs happy...
#endif

// adapted from http://chessprogramming.wikispaces.com/Population+Count#SWAR-Popcount
typedef unsigned long long u64;
static inline
long popc64(u64 x) {
    u64 k1 = 0x5555555555555555ull;
    u64 k2 = 0x3333333333333333ull;
    u64 k4 = 0x0f0f0f0f0f0f0f0full;
    u64 kf = 0x0101010101010101ull;
    x =  x       - ((x >> 1)  & k1); // put count of each 2 bits into those 2 bits
    x = (x & k2) + ((x >> 2)  & k2); // put count of each 4 bits into those 4 bits
    x = (x       +  (x >> 4)) & k4 ; // put count of each 8 bits into those 8 bits
    x = (x * kf) >> 56; // the 8 most significant bits of x + (x<<8) + (x<<16) + ...
    return x;
}
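
/* Worked example of the SWAR steps above for x = 0b1101 (three bits set):
   step 1:  0b1101 - 0b0100                      == 0b1001 (per-pair counts 10,01 = 2,1)
   step 2: (0b1001 & k2) + ((0b1001 >> 2) & k2)  == 0b0011 (per-nibble count: 3)
   step 3: no change for a value this small:        0b0011
   step 4: multiplying by kf sums all eight byte-counts into the top byte,
           and >> 56 extracts it, so popc64(0b1101) == 3.
 */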

size_t block_bits::_popc(bitmap bm) {
#ifdef __GNUC__
    
#if defined(__x86_64) || defined(i386) || defined(__i386__)
// #warning "Using __builtin_popcountll"
    return __builtin_popcountll(bm);
    
#elif defined(__sparcv9)
#warning "Using gcc inline asm to access sparcv9 'popc' instruction"
    long rval;
    __asm__("popc    %[in], %[out]" : [out] "=r"(rval) : [in] "r"(bm));
    return rval;
#else
#warning "using home-brew popc routine"
    return popc64(bm);
#endif
    
#else // !defined(__GNUC__)
#warning "using home-brew popc routine"
    return popc64(bm);
#endif
}

block_bits::block_bits(size_t chip_count)
    : _usable_chips(create_mask(chip_count))
    , _zombie_chips(0)
{
    assert(chip_count <= 8*sizeof(bitmap));
}

size_t block_bits::acquire(size_t chip_count) {
    (void) chip_count; // make gcc happy...
    
    /* find the rightmost set bit.

       If the map is smaller than the word size, but logically full we
       will compute an index equal to the capacity. If the map is
       physically full (_available == 0) then we'll still compute an
       index equal to capacity because 0-1 will underflow to all set
       bits. Therefore the check for full is the same whether the
       bitmap can use all its bits or not.
     */
    bitmap one_bit = _usable_chips & -_usable_chips;
    size_t index = _popc(one_bit-1);
    if(index < 8*sizeof(bitmap)) {
        // common case: we have space
        assert(index < chip_count);
        _usable_chips ^= one_bit;
    }
    else {
        // oops... full
        assert(index == 8*sizeof(bitmap));
    }

    return index;
}
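
/* Example: if _usable_chips == ...01100 then
   one_bit = 0b01100 & -0b01100 == 0b00100 (the lowest set bit), and
   index   = _popc(0b00100 - 1) == _popc(0b00011) == 2.
   With _usable_chips == 0, one_bit == 0, one_bit-1 is all ones, and
   index == 64 == 8*sizeof(bitmap), which fails the index < chip_count
   test in block::acquire.
 */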

void block_bits::release(size_t index, size_t chip_count) {
    // assign this chip to the zombie set for later recycling
    (void) chip_count; // keep gcc happy
    assert(index < chip_count);
    bitmap to_free = bitmap(1) << index;
    assert(! (to_free & *usable_chips()));
#if I_WISH
    bitmap was_free = __sync_fetch_and_or(&_zombie_chips, to_free);
#else
    // atomically OR the bit into the zombie set with a CAS retry loop
    membar_exit();
    bitmap volatile* ptr = &_zombie_chips;
    bitmap ov = *ptr;
    while(1) {
        bitmap nv = ov | to_free;
        bitmap cv = atomic_cas_64(ptr, ov, nv);
        if(cv == ov)
            break;
        ov = cv;
    }
    bitmap was_free = ov;
#endif
    (void) was_free; // keep gcc happy
    assert(! (was_free & to_free));
}

block_bits::bitmap block_bits::create_mask(size_t bits_set) {
    // doing it this way allows us to create a bitmap of all ones if need be
    return ~bitmap(0) >> (8*sizeof(bitmap) - bits_set);
}
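
/* Examples: create_mask(3) == 0b111, and create_mask(64) shifts by zero,
   yielding all ones. Note that bits_set == 0 would shift a 64-bit value
   by 64 bits, which is undefined behavior in C++, so callers are expected
   to request at least one chip.
 */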

void block_bits::recycle() {
    /* recycle the block.

       Whatever bits have gone zombie since we last recycled become
       the new set of usable bits. We also XOR them atomically back
       into the zombie set to clear them out there. That way we don't
       leak bits if a releasing thread races us and adds more bits to the
       zombie set after we read it.
    */
    bitmap newly_usable = *&_zombie_chips;
    _usable_chips |= newly_usable;
#if I_WISH
    __sync_xor_and_fetch(&_zombie_chips, newly_usable);
#else
    membar_exit();
    bitmap volatile* ptr = &_zombie_chips;
    bitmap ov = *ptr;
    while(1) {
        bitmap nv = ov ^ newly_usable;
        bitmap cv = atomic_cas_64(ptr, ov, nv);
        if(cv == ov)
            break;
        ov = cv;
    }
#endif
}

void* block::acquire(size_t chip_size, size_t chip_count, size_t /*block_size*/) {
    size_t index = _bits.acquire(chip_count);
    return (index < chip_count) ? _get(index, chip_size) : 0;
}

void block::release(void* ptr, size_t chip_size, size_t chip_count, size_t block_size)
{
    /* use pointer arithmetic to find the beginning of our block,
       where the block* lives.

       Our caller is responsible for ensuring this address actually
       falls inside a memory block.
     */
    union { void* v; size_t n; block* b; char* c; } u = {ptr}, v = u;
    u.n &= -block_size;
    size_t offset = v.c - u.b->_data;
    size_t idx = offset/chip_size;
    assert(u.b->_data + idx*chip_size == ptr);
    u.b->_bits.release(idx, chip_count);
}
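
/* Example: with block_size == 4096 (blocks must be power-of-two sized and
   aligned, as acquire_block's alignment assert enforces), -block_size ==
   0xffff...f000, so "u.n &= -block_size" clears the low 12 bits of ptr,
   rounding it down to the start of the enclosing block; offset/chip_size
   then recovers the chip index within that block.
 */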

char* block::_get(size_t index, size_t chip_size) {
    return _data + index*chip_size;
}

block::block(size_t chip_size, size_t chip_count, size_t block_size)
    : _bits(chip_count)
    , _owner(0)
    , _next(0)
{
    // make sure all the chips actually fit in this block
    char* end_of_block = _get(0, chip_size)-sizeof(this)+block_size;
    char* end_of_chips = _get(chip_count, chip_size);
    (void) end_of_block; // keep gcc happy
    (void) end_of_chips; // keep gcc happy
    assert(end_of_chips <= end_of_block);
    
    /* We purposefully don't check alignment here because some parts
       of the impl cheat for blocks which will never be used to
       allocate anything (the fake_block being the main culprit).
       The block_pool does check alignment, though.
     */
}

void* block_list::acquire(size_t chip_size, size_t chip_count, size_t block_size)
{
    // fast path: try the current tail block first
    if(void* ptr = _tail->acquire(TEMPLATE_ARGS)) {
        _hit_count++;
        return ptr;
    }

    // darn... gotta do it the hard way
    return _slow_acquire(TEMPLATE_ARGS);
}


block_list::block_list(block_pool* pool, size_t chip_size, size_t chip_count, size_t block_size)
    : _fake_block(TEMPLATE_ARGS)
    , _tail(&_fake_block)
    , _pool(pool)
    , _hit_count(0)
    , _avg_hit_rate(0)
{
    /* make the fake block advertise that it has nothing to give

       The first time the user tries to /acquire/ the fast case will
       detect that the fake block is "full" and fall back to the slow
       case. The slow case knows about the fake block and will remove
       it from the list.

       This trick lets us minimize the number of branches required by
       the fast path acquire.
     */
    _fake_block._bits._usable_chips = 0;
    _fake_block._bits._zombie_chips = 0;
}


void* block_list::_slow_acquire(size_t chip_size, size_t chip_count, size_t block_size)
{
    _change_blocks(TEMPLATE_ARGS);
    return acquire(TEMPLATE_ARGS);
}

block* block_list::acquire_block(size_t block_size)
{
    union { block* b; uintptr_t n; } u = {_pool->acquire_block(this)};
    (void) block_size; // keep gcc happy
    assert((u.n & -block_size) == u.n);
    return u.b;
}

void block_list::_change_blocks(size_t chip_size, size_t chip_count, size_t block_size)
{
    (void) chip_size; // keep gcc happy

    // first time through?
    if(_tail == &_fake_block) {
        _tail = acquire_block(block_size);
        _tail->_next = _tail;
        return;
    }
    
    /* Check whether we're chewing through our blocks too fast for the
       current ring size

       If we consistently recycle blocks while they're still more than
       half full then we need a bigger ring so old blocks have more
       time to cool off between reuses.
       
       To respond to end-of-spike gracefully, we're going to be pretty
       aggressive about returning blocks to the global pool: when
       recycling blocks we check for runs of blocks which do not meet
       some minimum utilization threshold, discarding all but one of
       them (keeping the last for buffering purposes).
    */
    static double const decay_rate = 1./5; // consider (roughly) the last 5 blocks
    // too few available suggests we should unload some extra blocks
    size_t const max_available = chip_count - std::max((int)(.1*chip_count), 1);
    // more than 50% in-use suggests we've got too few blocks
    size_t const min_allocated = (chip_count+1)/2;
    
    _avg_hit_rate = _hit_count*(1-decay_rate) + _avg_hit_rate*decay_rate;
    if(_hit_count < min_allocated && _avg_hit_rate < min_allocated) {
        // too fast... grow the ring
        block* new_block = acquire_block(block_size);
        new_block->_next = _tail->_next;
        _tail = _tail->_next = new_block;
    }
    else {
        // compress the run, if any
        block* prev = 0;
        block* cur = _tail;
        block* next;
        while(1) {
            next = cur->_next;
            next->recycle();
            if(next->_bits.usable_count() <= max_available)
                break;
            
            // compression possible?
            if(prev) {
                assert(prev != cur);
                assert(cur->_bits.usable_count() > max_available);
                assert(next->_bits.usable_count() > max_available);
                prev->_next = next;
                _pool->release_block(cur);
                cur = prev; // reset
            }

            // avoid the endless loop
            if(next == _tail)
                break;
            
            prev = cur;
            cur = next;
        }

        // recycle, repair the ring, and advance
        _tail = cur;
    }

    _hit_count = 0;
}
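
/* A numeric sketch of the decay above: with decay_rate == 1/5, each call
   computes avg = 0.8*current + 0.2*old, so history fades by a factor of
   five per _change_blocks and only (roughly) the last five hit counts
   matter. The ring grows only when both the instantaneous count and this
   average fall below min_allocated, i.e. blocks keep coming back to the
   tail while still more than half full.
 */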

block_list::~block_list() {
    // don't free the fake block if we went unused!
    if(_tail == &_fake_block) return;
    
    // break the cycle so the loop terminates
    block* cur = _tail->_next;
    _tail->_next = 0;

    // release blocks until we hit the NULL
    while( (cur=_pool->release_block(cur)) );
}

/**\endcond skip */
} // namespace memory_block
