/******************************************************************************
** FILE: cache.c
** Cache and memory simulator.
*/

#include <assert.h>
#include <signal.h>
#include <ucontext.h>
#include <sys/types.h>

#include "internal.h"
#include "fastsim.h"
#include "param.h"

/* These are the functions that are called from Facile */

extern void xcache_init();
extern void xcache_flush();

extern int xcache_load_start(ulong_t inum, ulong_t va,
			     int first, short *delay);
extern int xcache_load_continue(ulong_t inum, short *delay);

extern int xcache_store_start(ulong_t inum, ulong_t va,
			      int is_spec, short *delay);
extern int xcache_store_commit(ulong_t inum, short *delay);
extern void xcache_store_rollback(ulong_t inum);

extern void xcache_retire(ulong_t N);
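
/* A minimal sketch (not compiled in) of how a caller might drive the load
 * interface above, following the return conventions implemented below;
 * issue_load() and stall() are hypothetical caller-side names. */
#if 0
static void
issue_load(ulong_t inum, ulong_t va)
{
    short delay;
    if(xcache_load_start(inum, va, 1, &delay)) {
	stall(delay);			/* L1 hit: data ready after `delay` cycles */
    } else if(delay < 0) {
	stall(-delay);			/* aborted: no free MSHR (or fault, -128) */
    } else {
	stall(delay);			/* L1 miss: an MSHR now tracks the request */
	while(!xcache_load_continue(inum, &delay))
	    stall(delay);		/* still in flight; wait and re-check */
    }
}
#endif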

/******************************************************************************
 * Data types used by the cache simulator:
 */

typedef u_longlong_t alarm_t;

typedef enum EventTag {
    EVENT_LOAD_L1,
    EVENT_LOAD_L2,
    EVENT_LOAD_MEM,
    EVENT_STORE,
} EventTag_t;

typedef struct CacheEvent {
    struct CacheEvent *next_alloc;	/* next event in sorted event queue */
    struct CacheEvent *prev_alloc;	/* previous event in sorted event queue */
    struct CacheEvent *next_free;	/* next event in free list */
    EventTag_t kind;			/* what this event does when it fires */
    alarm_t alarm;			/* cycle at which this event fires */
    void* data;				/* MSHR or WBuf this event refers to */
} CacheEvent_t, *CacheEvent_p;

static CacheEvent_t eventQ;
static CacheEvent_t delayedQ;
static CacheEvent_p free_events;

#define MAX_NUM_EVENTS 512
static CacheEvent_t _events[MAX_NUM_EVENTS];

static CacheEvent_p event_top();
static CacheEvent_p event_alloc(alarm_t);
static void event_free(CacheEvent_p);

static CacheEvent_p delayed_event_alloc(alarm_t);
static void add_delayed_events();

typedef struct MSHR {
    struct MSHR *next_free;	/* next mshr in free list */
    ulong_t address;		/* cache line aligned load address */
    alarm_t alarm;		/* time of next relevant event */
    struct MSHR *mshr2;		/* point from l1-mshr to l2-mshr */
    ushort_t istart;		/* value of istate_start (when MSHR allocated) */
    short inum;			/* inum of load (when MSHR allocated) */
    uchar_t wait;		/* TRUE if waiting for a store to finish */
    uchar_t ref_count;		/* ref-count for garbage collection */
    uchar_t is_free;		/* flag if this mshr not allocated */
#ifdef STATS
    uchar_t num_coalesced;	/* number of loads coalesced into this MSHR */
#endif
} MSHR_t, *MSHR_p;

static MSHR_t mshr_list[NUM_MSHRs];
static MSHR_p mshr_free_list;
static ulong_t mshr_num_free;
static void mshr_free(MSHR_p);

static MSHR_t mshr2_list[NUM_MSHR2s];
static MSHR_p mshr2_free_list;
static ulong_t mshr2_num_free;
static void mshr2_free(MSHR_p);

typedef struct WBuf {
    struct WBuf *next_free;	/* next wbuf in free list */
    ulong_t address;		/* wbuf aligned store address */
    alarm_t alarm;		/* time of next relevant event */
    ushort_t istart;		/* value of istate_start (when wbuf allocated) */
    short inum;			/* inum of store (when wbuf allocated) */
    uchar_t is_free;		/* flag if this wbuf not allocated */
} WBuf_t, *WBuf_p;

static WBuf_t wbuf_list[NUM_WBUFs];
static WBuf_p wbuf_free_list;
static ulong_t wbuf_num_free;
static void wbuf_free(WBuf_p);

static void finish_free();	/* finish MSHR & WBuf free operations */

typedef struct CacheEntry {
    ulong_t address;		/* cache line aligned address */
    uchar_t is_free;		/* flag if this entry not allocated */
    uchar_t touch;		/* flag when touched (for LRU algorithm) */
} CacheEntry_t, *CacheEntry_p;

#define L1_BANK_SIZE (L1_CACHE_SIZE/L1_CACHE_ASSOC)
#define L1_BANK_LINES (L1_BANK_SIZE/L1_LINE_WIDTH)
static CacheEntry_t l1_cache[L1_BANK_LINES][L1_CACHE_ASSOC];
static uchar_t l1_clock[L1_BANK_LINES];

#define L2_BANK_SIZE (L2_CACHE_SIZE/L2_CACHE_ASSOC)
#define L2_BANK_LINES (L2_BANK_SIZE/L2_LINE_WIDTH)
static CacheEntry_t l2_cache[L2_BANK_LINES][L2_CACHE_ASSOC];
static uchar_t l2_clock[L2_BANK_LINES];
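
/* For example, with a (hypothetical) 16KB, 2-way L1 of 32-byte lines,
 * L1_BANK_SIZE is 8KB and L1_BANK_LINES is 256: an address maps to line
 * (va % 8192) / 32 and may occupy either of that line's 2 ways. */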

typedef union {
    MSHR_p mshr;
    WBuf_p wbuf;
} IState_t;

/* Address-probe hooks (not otherwise referenced in this file). */
void mem_test(ulong_t va);
static int testing_va = 0;
static ulong_t va_to_test;
static int va_test_failed;

/******************************************************************************
 * Static functions and global data:
 */

/* Current cycle of the pipeline simulation. (defined in Facile) */
extern u_longlong_t xCPU_cycle;

/* Last simulation cycle reached by the cache simulator. */
static u_longlong_t cache_cycle = 0;

#define NUM_ISTATES 0x100
static ulong_t istate_start = 0;	/* start index for istate array */
static IState_t istate[NUM_ISTATES];	/* interface state array */

/* Compute the delay from the current cycle to a given alarm cycle. */
static inline short
alarm_to_delay(alarm_t alarm)
{ return alarm - xCPU_cycle; }

static inline alarm_t
mshr_alarm(MSHR_p mshr)
{
    MSHR_p mshr2 = mshr->mshr2;
    if(mshr2) return mshr2->alarm;
    else return mshr->alarm;
}

static inline int
_buf_inum(int inum, ulong_t istart)
{
    /* Map to a signed inum relative to the current istate window;
     * the casts keep the subtraction signed so wraparound is well defined. */
    int ret = (int)((inum + istart) % NUM_ISTATES) - (int)istate_start;
    if(ret > NUM_ISTATES/2) return ret - NUM_ISTATES;
    if(ret < -(NUM_ISTATES/2)) return ret + NUM_ISTATES;
    return ret;
}

#define BUF_INUM(buf) (_buf_inum((buf)->inum,(buf)->istart))
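
/* Worked example: with NUM_ISTATES == 0x100, a buffer allocated when
 * istate_start was 0xE0 with inum 5 has logical index (5 + 0xE0) % 0x100
 * == 0xE5; after the window advances to istate_start == 0xF0, BUF_INUM
 * yields 0xE5 - 0xF0 == -11, i.e. eleven slots before the window start. */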

static MSHR_p cache_load_start(ulong_t,int,short*,int);
static int cache_load_continue(MSHR_p,short*);
static WBuf_p cache_store_start(ulong_t,int,short*,int);
static int cache_commit(WBuf_p,short*);
static void cache_rollback(WBuf_p);

static void load_l1(MSHR_p);		/* waits for conflicting stores */
static void load_l2(MSHR_p);		/* process a L2 cache lookup */

static int lookup_l1(ulong_t);
static void update_l1(ulong_t);
static void flush_line_l1(ulong_t);

static int lookup_l2(ulong_t);
static void update_l2(ulong_t);
static void flush_line_l2(ulong_t);

/* Cache Statistics */

hrtime_t time_cache = 0;
static hrtime_t time_cache_start;

u_longlong_t count_cache_flush = 0;
u_longlong_t count_cache_load_start = 0;
u_longlong_t count_cache_no_mshr = 0;
u_longlong_t count_cache_load_continue = 0;
u_longlong_t count_cache_load_hit_l1 = 0;
u_longlong_t count_cache_load_hit_l2 = 0;
u_longlong_t count_cache_load_memory = 0;
u_longlong_t count_cache_load_removed = 0;
u_longlong_t count_cache_store_start = 0;
u_longlong_t count_cache_store_spec = 0;
u_longlong_t count_cache_no_wbuf = 0;
u_longlong_t count_cache_no_wbuf_spec = 0;
u_longlong_t count_cache_store_commit = 0;
u_longlong_t count_cache_store_commit_fail = 0;
u_longlong_t count_cache_store_rollback = 0;
u_longlong_t count_cache_segfault = 0;

/******************************************************************************
 * Cache simulator interface functions: These functions hide the cache
 * simulator's internal data structures (i.e., MSHR_t and WBuf_t) from
 * Facile.
 */

void
xcache_init()
{
    ulong_t ii, jj;

    for(ii=0; ii<MAX_NUM_EVENTS-1; ++ii)
	_events[ii].next_free = &_events[ii+1];
    _events[MAX_NUM_EVENTS-1].next_free = NULL;
    free_events = _events;

    eventQ.next_alloc = &eventQ;
    eventQ.prev_alloc = &eventQ;
    eventQ.alarm = 0;

    delayedQ.next_alloc = &delayedQ;
    delayedQ.prev_alloc = &delayedQ;
    delayedQ.alarm = 0;

    for(ii=0; ii<NUM_MSHRs; ++ii) {
	mshr_list[ii].next_free = &mshr_list[ii+1];
	mshr_list[ii].is_free = 1;
    }
    mshr_list[NUM_MSHRs-1].next_free = NULL;
    mshr_free_list = mshr_list;
    mshr_num_free = NUM_MSHRs;

    for(ii=0; ii<NUM_MSHR2s; ++ii) {
	mshr2_list[ii].next_free = &mshr2_list[ii+1];
	mshr2_list[ii].is_free = 1;
    }
    mshr2_list[NUM_MSHR2s-1].next_free = NULL;
    mshr2_free_list = mshr2_list;
    mshr2_num_free = NUM_MSHR2s;

    for(ii=0; ii<NUM_WBUFs; ++ii) {
	wbuf_list[ii].next_free = &wbuf_list[ii+1];
	wbuf_list[ii].is_free = 1;
    }
    wbuf_list[NUM_WBUFs-1].next_free = NULL;
    wbuf_free_list = wbuf_list;
    wbuf_num_free = NUM_WBUFs;

    for(ii=0; ii<L1_BANK_LINES; ++ii) {
	for(jj=0; jj<L1_CACHE_ASSOC; ++jj)
	    l1_cache[ii][jj].is_free = 1;
	l1_clock[ii] = 0;
    }

    for(ii=0; ii<L2_BANK_LINES; ++ii) {
	for(jj=0; jj<L2_CACHE_ASSOC; ++jj)
	    l2_cache[ii][jj].is_free = 1;
	l2_clock[ii] = 0;
    }

    istate_start = 0;
}

void
xcache_flush()
{
#ifdef STATS
    ++count_cache_flush;
#endif
}

int
xcache_load_start(ulong_t inum, ulong_t va, int first, short *delay)
{
    ulong_t index;
    MSHR_p mshr;
    int ret;
#ifdef STATS
    ++count_cache_load_start;
    time_cache_start = gethrtime();
#endif
    index = (inum + istate_start) % NUM_ISTATES;
    mshr = cache_load_start(va,first,delay,(int)inum);
    istate[index].mshr = mshr;
    ret = (mshr || *delay < 0) ? 0 : 1;	/* 1 => L1 hit completing in *delay */
#ifdef STATS
    time_cache += gethrtime() - time_cache_start;
    if(*delay < 0) ++count_cache_no_mshr;
#endif
    return ret;
}

int
xcache_load_continue(ulong_t inum, short *delay)
{
    ulong_t index;
    int ret;
#ifdef STATS
    ++count_cache_load_continue;
    time_cache_start = gethrtime();
#endif
    index = (inum + istate_start) % NUM_ISTATES;
    ret = cache_load_continue(istate[index].mshr,delay);
#ifdef STATS
    time_cache += gethrtime() - time_cache_start;
#endif
    return ret;
}

int
xcache_store_start(ulong_t inum, ulong_t va, int is_spec, short *delay)
{
    ulong_t index;
    WBuf_p wbuf;
    int ret;
#ifdef STATS
    ++count_cache_store_start;
    if(is_spec) ++count_cache_store_spec;
    time_cache_start = gethrtime();
#endif
    index = (inum + istate_start) % NUM_ISTATES;
    wbuf = cache_store_start(va,is_spec,delay,(int)inum);
    istate[index].wbuf = wbuf;
    ret = wbuf ? 1 : 0;			/* 1 => a write-buffer was allocated */
#ifdef STATS
    time_cache += gethrtime() - time_cache_start;
    if(!wbuf) {
	++count_cache_no_wbuf;
	if(is_spec) ++count_cache_no_wbuf_spec;
    } else if(*delay > 0)
	++count_cache_store_commit_fail;
#endif
    return ret;
}

int
xcache_store_commit(ulong_t inum, short *delay)
{
    ulong_t index;
    int ret;
#ifdef STATS
    ++count_cache_store_commit;
    time_cache_start = gethrtime();
#endif
    index = (inum + istate_start) % NUM_ISTATES;
    ret = cache_commit(istate[index].wbuf,delay);
#ifdef STATS
    time_cache += gethrtime() - time_cache_start;
    if(!ret) ++count_cache_store_commit_fail;
#endif
    return ret;
}

void
xcache_store_rollback(ulong_t inum)
{
    ulong_t index;
#ifdef STATS
    ++count_cache_store_rollback;
    time_cache_start = gethrtime();
#endif
    index = (inum + istate_start) % NUM_ISTATES;
    cache_rollback(istate[index].wbuf);
#ifdef STATS
    time_cache += gethrtime() - time_cache_start;
#endif
}

void
xcache_retire(ulong_t N)
{
    istate_start = (istate_start + N) % NUM_ISTATES;
}
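
/* A minimal sketch (not compiled in) of the speculative-store protocol the
 * entry points above imply; issue_store(), mispredicted(), and stall() are
 * hypothetical caller-side names. */
#if 0
static void
issue_store(ulong_t inum, ulong_t va, int spec)
{
    short delay;
    if(!xcache_store_start(inum, va, spec, &delay))
	return;				/* no write-buffer free: retry in -delay */
    if(spec) {
	if(mispredicted())
	    xcache_store_rollback(inum);	/* squash: release the write-buffer */
	else {
	    while(!xcache_store_commit(inum, &delay))
		stall(delay);		/* wait for conflicting loads to drain */
	}
    }
}
#endif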

/******************************************************************************
 ******************************************************************************
 *** The rest of these routines perform the actual cache simulation!        ***
 ******************************************************************************
 *****************************************************************************/

/* Advance cache simulation to current execution time. */
static void
cache_advance()
{
    assert(xCPU_cycle >= cache_cycle);

    for(;;) {
	CacheEvent_p event = event_top();
	if(!event || event->alarm > xCPU_cycle)
	    break;

	cache_cycle = event->alarm;
	finish_free();

	switch(event->kind) {
	 case EVENT_LOAD_L1: load_l1((MSHR_p)event->data); break;
	 case EVENT_LOAD_L2: load_l2((MSHR_p)event->data); break;
	 case EVENT_LOAD_MEM: {
	     /* just waiting to finish the load */
	     MSHR_p mshr = (MSHR_p)event->data;
	     update_l1(mshr->address);
	     assert(mshr->mshr2);
	     update_l2(mshr->address);
#ifdef STATS
	     count_cache_load_memory += mshr->num_coalesced;
#endif
	     mshr_free(mshr);
	     break;
	 }
	 case EVENT_STORE: {
	     /* just waiting to finish the store */
	     WBuf_p wbuf = (WBuf_p)event->data;
	     add_delayed_events();
	     wbuf_free(wbuf);
	     break;
	 }
	 default: assert(!"Unknown cache event!"); break;
	}

	event_free(event);
    }

    add_delayed_events();
    cache_cycle = xCPU_cycle;
    finish_free();
}

/******************************************************************************
 * This is a never-before-seen load request:
 * find the given address in the L1 cache, else allocate an L1 MSHR
 * to manage the miss through the rest of the memory hierarchy.
 */

MSHR_p
cache_load_start(ulong_t va, int first, short *delay, int inum)
{
    CacheEvent_p event;
    alarm_t alarm = 0;
    MSHR_p mshr = NULL;
    WBuf_p wbuf = NULL;
    int mshr_inum;
    int wbuf_inum;
    int ii;

    if(inum > 0) {
	/* Probe the host address backing this simulated access; an
	 * unmapped address is reported back as a fault (*delay == -128). */
	if(va == 0) xmem_test_failed = 1;
	else {
	    xmem_test_flag = 1;
	    xmem_test_failed = 0;
	    xmem_test_address = (void*)va;
	    xmem_load8_test((void*)va);
	    xmem_test_flag = 0;
	}
	if(xmem_test_failed) {
#ifdef STATS
	    ++count_cache_segfault;
#endif
	    *delay = -128; return NULL;
	}
    }

    va &= ~(L1_LINE_WIDTH-1);

    /**************************************************************************
     * Update cache state, advancing the simulation up
     * to the current execution time (i.e., xCPU_cycle). */
    cache_advance();

    /* Conservatively avoid conflicts with data in the write-buffer.
     * Don't allow any conflicting loads until prior writes complete. */

    alarm = xCPU_cycle + 1;
    wbuf_inum = -NUM_ISTATES;
    for(ii=0; ii<NUM_WBUFs; ++ii) {
	WBuf_p _wbuf = &wbuf_list[ii];
	int _wbuf_inum = BUF_INUM(_wbuf);
	if(_wbuf->is_free) continue;
	if((_wbuf->address & ~(L1_LINE_WIDTH-1)) == va && _wbuf_inum <= inum) {
	    if(_wbuf->alarm > alarm) alarm = _wbuf->alarm;
	    if(_wbuf_inum > wbuf_inum) {
		wbuf_inum = _wbuf_inum;
		wbuf = _wbuf;
	    }
	}
    }

    /* Lookup address in the L1 cache. */

    if(!wbuf && lookup_l1(va)) {
#ifdef STATS
	++count_cache_load_hit_l1;
#endif
	*delay = CACHE_HIT_DELAY;
	return NULL;
    }

    /* L1 cache miss: First try coalescing this load
     * with a prior load request. */

    for(ii=0; ii<NUM_MSHRs; ++ii) {
	mshr = &mshr_list[ii];
	mshr_inum = BUF_INUM(mshr);
	if(mshr->is_free || (wbuf && mshr_inum < wbuf_inum))
	    continue;
	if(mshr->address == va && mshr_inum <= inum) {
	    *delay = alarm_to_delay(mshr_alarm(mshr));
#ifdef STATS
	    ++mshr->num_coalesced;
#endif
	    return mshr;
	}
    }

    /* Test if we can allocate a new L1 MSHR. */

    if(mshr_num_free < 1 || (!first && mshr_num_free <= 1)) {
	/* Cannot allocate an MSHR for the L1 cache;
	 * Return the minimum cycle delay until one becomes free. */
	alarm = ~(alarm_t)0;
	for(ii=0; ii<NUM_MSHRs; ++ii) {
	    mshr = &mshr_list[ii];
	    if(mshr->is_free) continue;
	    if(mshr_alarm(mshr) < alarm)
		alarm = mshr_alarm(mshr);
	}

	/* delay returned negative => operation aborted */
	*delay = -alarm_to_delay(alarm);
	return NULL;
    }

    if(!wbuf) alarm = xCPU_cycle + L1_MISS_DELAY;

    /* Allocate new L1 MSHR. */

    mshr = mshr_free_list; mshr_free_list = mshr->next_free;
    mshr->is_free = 0; mshr->ref_count = 1; --mshr_num_free;
    mshr->address = va & ~(L1_LINE_WIDTH-1);
    mshr->alarm = alarm;
    mshr->mshr2 = NULL;
    mshr->istart = istate_start;
    mshr->inum = inum;
    mshr->wait = wbuf ? 1 : 0;
#ifdef STATS
    mshr->num_coalesced = 1;
#endif

    /* Add a cache event to continue processing this load request. */

    event = event_alloc(alarm);
    event->kind = wbuf ? EVENT_LOAD_L1 : EVENT_LOAD_L2;
    event->data = mshr;

    /* Set the delay for an L1 cache miss,
     * and return the corresponding MSHR. */

    *delay = alarm_to_delay(alarm);
    return mshr;
}

/******************************************************************************
 * Delay has expired on an existing load: advance the cache simulation.
 */

int
cache_load_continue(MSHR_p mshr, short *delay)
{
    alarm_t alarm;

    /**************************************************************************
     * Update cache state, advancing the simulation up
     * to the current execution time (i.e., xCPU_cycle). */
    cache_advance();

    alarm = mshr_alarm(mshr);
    assert(alarm >= xCPU_cycle);
    if(alarm == xCPU_cycle) {		/* if load is done */
	*delay = CACHE_HIT_DELAY;
	return 1;
    }

    /* Load still in progress */
    *delay = alarm_to_delay(alarm);
    return 0;
}

void
load_l1(MSHR_p mshr)
{
    CacheEvent_p event;
    WBuf_p wbuf = NULL;
    alarm_t alarm = 0;
    int wbuf_inum;
    int mshr_inum;
    int retry = 0;
    int ii;

    /* Verify that all conflicting stores have completed. */

    alarm = xCPU_cycle;
    wbuf_inum = -NUM_ISTATES;
    mshr_inum = BUF_INUM(mshr);
    for(ii=0; ii<NUM_WBUFs; ++ii) {
	WBuf_p _wbuf = &wbuf_list[ii];
	int _wbuf_inum = BUF_INUM(_wbuf);
	if(_wbuf->is_free) continue;
	if((_wbuf->address & ~(L1_LINE_WIDTH-1)) == mshr->address &&
	   _wbuf_inum <= mshr_inum) {
	    if(_wbuf->alarm > alarm) alarm = _wbuf->alarm;
	    if(_wbuf_inum > wbuf_inum) {
		wbuf_inum = _wbuf_inum;
		wbuf = _wbuf;
	    }
	}
    }

    if(wbuf) {
	/* Still waiting for conflicting stores */
	assert(!mshr->mshr2);
	if(!wbuf->alarm) ++alarm;	/* store not yet committed: poll next cycle */
	retry = alarm == xCPU_cycle;	/* due this same cycle => use delayedQ */

    } else {
	/* No more conflicting stores => start processing load. */

	if(lookup_l1(mshr->address)) {
	    /* Hit in L1 cache */
#ifdef STATS
	    count_cache_load_hit_l1 += mshr->num_coalesced;
#endif
	    mshr_free(mshr);
	    return;
	}

	/* Miss in L1 cache */
	alarm = xCPU_cycle + L1_MISS_DELAY;
    }

    /* Add a cache event to continue processing this load request. */

    mshr->alarm = alarm;
    mshr->wait = wbuf ? 1 : 0;
    if(!retry) event = event_alloc(alarm);
    else event = delayed_event_alloc(alarm);
    event->kind = wbuf ? EVENT_LOAD_L1 : EVENT_LOAD_L2;
    event->data = mshr;
}

void
load_l2(MSHR_p mshr)
{
    CacheEvent_p event;
    int mshr_inum;
    MSHR_p mshr2;
    int ii;

    /* Lookup load address in the L2 cache.
     * If found, then we are done with this load. */

    if(lookup_l2(mshr->address)) {
#ifdef STATS
	count_cache_load_hit_l2 += mshr->num_coalesced;
#endif
	update_l1(mshr->address);
	mshr_free(mshr);
	return;
    }

    /**************************************************************************
     * Load missed in the L2 cache.
     */

    /* Try to assign this load to an L2 MSHR. First test if it can be
     * coalesced into an existing L2 MSHR for an earlier instruction. */

    mshr_inum = BUF_INUM(mshr);
    for(ii=0; ii<NUM_MSHR2s; ++ii) {
	mshr2 = &mshr2_list[ii];
	if(mshr2->is_free) continue;
	if(mshr2->address == (mshr->address & ~(L2_LINE_WIDTH-1)) &&
	   BUF_INUM(mshr2) <= mshr_inum) {
	    mshr->mshr2 = mshr2;
	    ++mshr2->ref_count;
	    break;
	}
    }

    if(!mshr->mshr2) {
	/* Allocate a new L2 MSHR for this load.
	 * (There should always be a free L2 MSHR.) */
	assert(mshr2_free_list);
	mshr2 = mshr2_free_list; mshr2_free_list = mshr2->next_free;
	mshr2->is_free = 0; mshr2->ref_count = 1; --mshr2_num_free;
	mshr2->address = mshr->address & ~(L2_LINE_WIDTH-1);
	mshr2->alarm = cache_cycle + L2_MISS_DELAY;
	mshr2->istart = mshr->istart;
	mshr2->inum = mshr->inum;
	mshr->mshr2 = mshr2;
    }

    /* Add a cache event to continue processing this load request. */

    event = event_alloc(mshr_alarm(mshr));
    event->kind = EVENT_LOAD_MEM;
    event->data = mshr;
}

WBuf_p
cache_store_start(ulong_t va, int is_spec, short *delay, int inum)
{
    WBuf_p wbuf;
    int ii;

    va &= ~(WRITE_LINE_WIDTH-1);

    /**********************************************************************
     * Update cache state, advancing the simulation up
     * to the current execution time (i.e., xCPU_cycle). */
    cache_advance();

    /* Try to allocate a new write-buffer. */

    if(wbuf_num_free <= 0) {
	/* No free write-buffers; Delay 'til first free buffer. */
	alarm_t alarm = ~(alarm_t)0;
	for(ii=0; ii<NUM_WBUFs; ++ii) {
	    wbuf = &wbuf_list[ii];
	    if(wbuf->is_free) continue;
	    if(wbuf->alarm > 0 && wbuf->alarm < alarm)
		alarm = wbuf->alarm;
	}
	if(alarm == ~(alarm_t)0) *delay = -1;
	else {
	    assert(alarm > xCPU_cycle);
	    *delay = -alarm_to_delay(alarm);
	}
	return NULL;
    }

    /* Allocate a new write-buffer. */

    wbuf = wbuf_free_list; wbuf_free_list = wbuf->next_free;
    wbuf->is_free = 0; --wbuf_num_free;
    wbuf->address = va & ~(WRITE_LINE_WIDTH-1);
    wbuf->istart = istate_start;
    wbuf->inum = inum;
    wbuf->alarm = 0;

    if(is_spec) *delay = 0;
    else {
	ii = cache_commit(wbuf,delay);
	assert(ii || *delay > 0);
    }

    /* This store may conflict with executing loads that appear to
     * occur later in the instruction stream. This can happen if a
     * load was issued speculatively, then later rolled back. If a
     * conflict exists, cancel the load. */

    for(ii=0; ii<NUM_MSHRs; ++ii) {
	MSHR_p _mshr = &mshr_list[ii];
	if(_mshr->is_free) continue;
	if(_mshr->address == (va & ~(L1_LINE_WIDTH-1)) &&
	   BUF_INUM(_mshr) >= inum && !_mshr->wait) {
	    /* Remove pending events for this MSHR and free the MSHR.
	     * (event_free() only unlinks an event, leaving its next_alloc
	     * pointer intact, so iterating during removal stays safe.) */
	    CacheEvent_p event;
	    for(event=eventQ.next_alloc; event!=&eventQ;
		event=event->next_alloc) {
		switch(event->kind) {
		 case EVENT_LOAD_L1:
		 case EVENT_LOAD_L2:
		 case EVENT_LOAD_MEM:
		    if((MSHR_p)event->data == _mshr)
			event_free(event);
		    break;
		 default: break;
		}
	    }
#ifdef STATS
	    count_cache_load_removed += _mshr->num_coalesced;
#endif
	    mshr_free(_mshr);
	}
    }

    return wbuf;
}

int
cache_commit(WBuf_p wbuf, short *delay)
{
    CacheEvent_p event;
    MSHR_p mshr = NULL;
    alarm_t alarm = 0;
    int wbuf_inum;
    int ii;

    /**************************************************************************
     * Update cache state, advancing the simulation up
     * to the current execution time (i.e., xCPU_cycle). */
    cache_advance();

    /* Make sure this store occurs after conflicting loads complete. */

    wbuf_inum = BUF_INUM(wbuf);
    for(ii=0; ii<NUM_MSHRs; ++ii) {
	MSHR_p _mshr = &mshr_list[ii];
	if(_mshr->is_free) continue;
	if(_mshr->address == (wbuf->address & ~(L1_LINE_WIDTH-1)) &&
	   BUF_INUM(_mshr) < wbuf_inum && mshr_alarm(_mshr) > alarm) {
	    mshr = _mshr; alarm = mshr_alarm(_mshr);
	}
    }

    if(mshr) {
	/* Conflicting load found */
	wbuf->alarm = alarm;
	*delay = alarm_to_delay(alarm);
	return 0;
    }

    /* Continue to simulate timing of store out to memory. */
    alarm = xCPU_cycle + WRITE_DELAY;
    wbuf->alarm = alarm;

    /* Add a cache event to continue processing this store. */
    event = event_alloc(alarm);
    event->kind = EVENT_STORE;
    event->data = wbuf;

    /* No conflicts => allow commit to succeed. */
    *delay = 0; return 1;
}

void
cache_rollback(WBuf_p wbuf)
{
    /**************************************************************************
     * Update cache state, advancing the simulation up
     * to the current execution time (i.e., xCPU_cycle). */
    cache_advance();

    /* free resources */
    wbuf_free(wbuf);
}

CacheEvent_p
event_top()
{
    CacheEvent_p event = eventQ.next_alloc;
    if(event == &eventQ) return NULL;
    return event;
}
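
/* event_alloc() keeps eventQ sorted by ascending alarm: it scans from the
 * tail and inserts after the last entry whose alarm is <= the new one,
 * which also keeps events with equal alarms in FIFO order. */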

CacheEvent_p
event_alloc(alarm_t alarm)
{
    CacheEvent_p event, ee;

    assert(free_events);
    event = free_events;
    free_events = event->next_free;
    event->alarm = alarm;

    for(ee=eventQ.prev_alloc; ee!=&eventQ; ee=ee->prev_alloc)
	if(alarm >= ee->alarm) break;

    event->next_alloc = ee->next_alloc; event->prev_alloc = ee;
    event->next_alloc->prev_alloc = ee->next_alloc = event;
    return event;
}

void
event_free(CacheEvent_p event)
{
    event->next_alloc->prev_alloc = event->prev_alloc;
    event->prev_alloc->next_alloc = event->next_alloc;
    event->next_free = free_events;
    free_events = event;
}
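
/* Same-cycle retry events (alarm == xCPU_cycle) are queued on delayedQ
 * rather than eventQ so that cache_advance()'s event loop cannot pick
 * them up again in the same pass; add_delayed_events() migrates them
 * onto eventQ once the loop has drained. */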

CacheEvent_p
delayed_event_alloc(alarm_t alarm)
{
    CacheEvent_p event, ee;

    assert(free_events);
    event = free_events;
    free_events = event->next_free;
    event->alarm = alarm;

    for(ee=delayedQ.prev_alloc; ee!=&delayedQ; ee=ee->prev_alloc)
	if(alarm >= ee->alarm) break;

    event->next_alloc = ee->next_alloc; event->prev_alloc = ee;
    event->next_alloc->prev_alloc = ee->next_alloc = event;
    return event;
}

void
add_delayed_events()
{
    CacheEvent_p event, next, ee;
    for(event=delayedQ.next_alloc; event!=&delayedQ; event=next) {
	next = event->next_alloc;

	for(ee=eventQ.prev_alloc; ee!=&eventQ; ee=ee->prev_alloc)
	    if(event->alarm >= ee->alarm) break;

	event->next_alloc = ee->next_alloc; event->prev_alloc = ee;
	event->next_alloc->prev_alloc = ee->next_alloc = event;
    }

    delayedQ.next_alloc = delayedQ.prev_alloc = &delayedQ;
}

static ulong_t wbuf_num_to_free = 0;
static ulong_t mshr_num_to_free = 0;
static ulong_t mshr2_num_to_free = 0;
static WBuf_p wbuf_to_free_list = NULL;
static MSHR_p mshr_to_free_list = NULL;
static MSHR_p mshr2_to_free_list = NULL;
static WBuf_p *wbuf_to_free_list_end = &wbuf_to_free_list;
static MSHR_p *mshr_to_free_list_end = &mshr_to_free_list;
static MSHR_p *mshr2_to_free_list_end = &mshr2_to_free_list;
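
/* Frees are deferred: mshr_free()/mshr2_free()/wbuf_free() mark a buffer
 * free and queue it here, but finish_free() only credits the free counts
 * once cache_cycle has advanced, so a buffer released at cycle T cannot
 * be reallocated to another request in that same cycle. */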

void
mshr_free(MSHR_p mshr)
{
    assert(!mshr->is_free && mshr->ref_count > 0);
    if(--mshr->ref_count > 0) return;

    if(mshr->mshr2) mshr2_free(mshr->mshr2);

    ++mshr_num_to_free;
    *mshr_to_free_list_end = mshr;
    mshr_to_free_list_end = &mshr->next_free;
    mshr->next_free = NULL;
    mshr->is_free = 1;
}

void
mshr2_free(MSHR_p mshr)
{
    assert(!mshr->is_free && mshr->ref_count > 0);
    if(--mshr->ref_count > 0) return;

    ++mshr2_num_to_free;
    *mshr2_to_free_list_end = mshr;
    mshr2_to_free_list_end = &mshr->next_free;
    mshr->next_free = NULL;
    mshr->is_free = 1;
}

void
wbuf_free(WBuf_p wbuf)
{
    assert(!wbuf->is_free);

    ++wbuf_num_to_free;
    *wbuf_to_free_list_end = wbuf;
    wbuf_to_free_list_end = &wbuf->next_free;
    wbuf->next_free = NULL;
    wbuf->is_free = 1;
}

void
finish_free()
{
    static u_longlong_t last_free_cycle = 0;
    if(cache_cycle <= last_free_cycle) return;
    last_free_cycle = cache_cycle;

    /* Splice each pending to-free list onto the head of its free list,
     * then reset the to-free list to empty. */

    *mshr_to_free_list_end = mshr_free_list;
    mshr_free_list = mshr_to_free_list;
    mshr_to_free_list = NULL;
    mshr_to_free_list_end = &mshr_to_free_list;
    mshr_num_free += mshr_num_to_free;
    mshr_num_to_free = 0;

    *mshr2_to_free_list_end = mshr2_free_list;
    mshr2_free_list = mshr2_to_free_list;
    mshr2_to_free_list = NULL;
    mshr2_to_free_list_end = &mshr2_to_free_list;
    mshr2_num_free += mshr2_num_to_free;
    mshr2_num_to_free = 0;

    *wbuf_to_free_list_end = wbuf_free_list;
    wbuf_free_list = wbuf_to_free_list;
    wbuf_to_free_list = NULL;
    wbuf_to_free_list_end = &wbuf_to_free_list;
    wbuf_num_free += wbuf_num_to_free;
    wbuf_num_to_free = 0;
}

/******************************************************************************
 * The actual cache data structures and access functions.
 */

int
lookup_l1(ulong_t va)
{
    ulong_t line = (va % L1_BANK_SIZE) / L1_LINE_WIDTH;
    int ii;

    va &= ~(L1_LINE_WIDTH-1);
    for(ii=0; ii<L1_CACHE_ASSOC; ++ii) {
	if(!l1_cache[line][ii].is_free &&
	   l1_cache[line][ii].address == va) {
	    l1_cache[line][ii].touch = 1;
	    lookup_l2(va);	/* also touch L2 so its replacement state sees this hit */
	    return 1;
	}
    }

    return 0;
}
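
/* update_l1() fills or replaces a line with a clock (second-chance) sweep:
 * starting at l1_clock[line], entries with their touch bit set get a second
 * chance (bit cleared, hand advances); the first free or untouched entry is
 * the victim. update_l2() below applies the same policy to the L2 arrays. */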

void
update_l1(ulong_t va)
{
    ulong_t line = (va % L1_BANK_SIZE) / L1_LINE_WIDTH;
    int ii;

    va &= ~(L1_LINE_WIDTH-1);
    for(ii=0; ii<L1_CACHE_ASSOC; ++ii) {
	if(!l1_cache[line][ii].is_free &&
	   l1_cache[line][ii].address == va) {
	    return;
	}
    }

    for(ii = l1_clock[line];
	!l1_cache[line][ii].is_free && l1_cache[line][ii].touch;
	ii = (ii+1) % L1_CACHE_ASSOC) {
	l1_cache[line][ii].touch = 0;
    }

    l1_cache[line][ii].address = va;
    l1_cache[line][ii].is_free = 0;
    l1_cache[line][ii].touch = 0;
    l1_clock[line] = (ii+1) % L1_CACHE_ASSOC;
}

void
flush_line_l1(ulong_t va)
{
    ulong_t line = (va % L1_BANK_SIZE) / L1_LINE_WIDTH;
    int ii;

    va &= ~(L1_LINE_WIDTH-1);
    for(ii=0; ii<L1_CACHE_ASSOC; ++ii) {
	if(!l1_cache[line][ii].is_free &&
	   l1_cache[line][ii].address == va) {
	    l1_cache[line][ii].is_free = 1;
	    return;
	}
    }
}

int
lookup_l2(ulong_t va)
{
    ulong_t line = (va % L2_BANK_SIZE) / L2_LINE_WIDTH;
    int ii;

    va &= ~(L2_LINE_WIDTH-1);
    for(ii=0; ii<L2_CACHE_ASSOC; ++ii) {
	if(!l2_cache[line][ii].is_free &&
	   l2_cache[line][ii].address == va) {
	    l2_cache[line][ii].touch = 1;
	    return 1;
	}
    }

    return 0;
}

void
update_l2(ulong_t va)
{
    ulong_t line = (va % L2_BANK_SIZE) / L2_LINE_WIDTH;
    int ii;

    va &= ~(L2_LINE_WIDTH-1);
    for(ii=0; ii<L2_CACHE_ASSOC; ++ii) {
	if(!l2_cache[line][ii].is_free &&
	   l2_cache[line][ii].address == va) {
	    return;
	}
    }

    for(ii = l2_clock[line];
	!l2_cache[line][ii].is_free && l2_cache[line][ii].touch;
	ii = (ii+1) % L2_CACHE_ASSOC) {
	l2_cache[line][ii].touch = 0;
    }

    l2_cache[line][ii].address = va;
    l2_cache[line][ii].is_free = 0;
    l2_cache[line][ii].touch = 0;
    l2_clock[line] = (ii+1) % L2_CACHE_ASSOC;
}

void
flush_line_l2(ulong_t va)
{
    ulong_t line = (va % L2_BANK_SIZE) / L2_LINE_WIDTH;
    int ii;

    va &= ~(L2_LINE_WIDTH-1);
    for(ii=0; ii<L2_CACHE_ASSOC; ++ii) {
	if(!l2_cache[line][ii].is_free &&
	   l2_cache[line][ii].address == va) {
	    l2_cache[line][ii].is_free = 1;
	    return;
	}
    }
}