# HG changeset patch # User Lena Olson # Date 1449097727 21600 # Node ID 6fa05c2ac585027a388950a6285d0c395ca9ab76 # Parent 39259afef4564d564f4aefd2b22cff2e17110c55 imported patch fix-regress diff -r 39259afef456 -r 6fa05c2ac585 tests/regress.py --- a/tests/regress.py Fri Sep 19 00:22:52 2014 -0500 +++ b/tests/regress.py Wed Dec 02 17:08:47 2015 -0600 @@ -182,7 +182,7 @@ scons_opts += ' --ignore-style --no-lto EXTRAS=../gem5-gpu/src:../gpgpu-sim' for target in targets: - cmd = 'scons %s --default=../../gem5-gpu/build_opts/%s %s' % \ + cmd = 'python /usr/bin/scons %s --default=../../gem5-gpu/build_opts/%s %s' % \ (scons_opts, target[0], target[1]) print "Building/Running scons command: %s\n" % cmd if options.no_exec: # HG changeset patch # User Lena Olson # Date 1449097727 21600 # Node ID 2db8c41103a182f25d137cabed9e724b01be6878 # Parent 6fa05c2ac585027a388950a6285d0c395ca9ab76 imported patch fixes-for-gcc4.8 diff -r 6fa05c2ac585 -r 2db8c41103a1 src/gpu/shader_tlb.hh --- a/src/gpu/shader_tlb.hh Wed Dec 02 17:08:47 2015 -0600 +++ b/src/gpu/shader_tlb.hh Wed Dec 02 17:08:47 2015 -0600 @@ -54,6 +54,7 @@ public: virtual bool lookup(Addr vpn, Addr& ppn, bool set_mru=true) = 0; virtual void insert(Addr vpn, Addr ppn) = 0; + virtual ~BaseTLBMemory() {} }; class TLBMemory : public BaseTLBMemory { diff -r 6fa05c2ac585 -r 2db8c41103a1 src/mem/ruby/RubySlicc_GPUMappings.hh --- a/src/mem/ruby/RubySlicc_GPUMappings.hh Wed Dec 02 17:08:47 2015 -0600 +++ b/src/mem/ruby/RubySlicc_GPUMappings.hh Wed Dec 02 17:08:47 2015 -0600 @@ -40,7 +40,7 @@ inline MachineID getL2ID(Address addr, int num_l2, int select_bits, int select_start_bit) { - int num = 0; + unsigned int num = 0; if (select_bits) { if (num_l2 > pow(2, select_bits)) fatal("Number of GPU L2 select bits set incorrectly?"); # HG changeset patch # User Lena Olson # Date 1449097727 21600 # Node ID dbcd4579a673931bb7f07c9328c8d66fc37f179f # Parent 2db8c41103a182f25d137cabed9e724b01be6878 Updates to work with gem5 version 10451:3a87241adfb8 Updates head file placement and slightly changed TLB::finish params diff -r 2db8c41103a1 -r dbcd4579a673 configs/gpu_protocol/MESI_Two_Level_fusion.py --- a/configs/gpu_protocol/MESI_Two_Level_fusion.py Wed Dec 02 17:08:47 2015 -0600 +++ b/configs/gpu_protocol/MESI_Two_Level_fusion.py Wed Dec 02 17:08:47 2015 -0600 @@ -100,6 +100,8 @@ options.cpu_type == "detailed"), prefetcher = prefetcher, ruby_system = ruby_system, + #clk_domain=system.cpu[i].clk_domain, + transitions_per_cycle=options.ports, enable_prefetch = False) cpu_seq = RubySequencer(version = options.num_cpus + i, @@ -107,6 +109,7 @@ dcache = l1d_cache, access_phys_mem = True, max_outstanding_requests = options.gpu_l1_buf_depth, + #clk_domain=system.cpu[i].clk_domain, ruby_system = ruby_system, connect_to_io = False) @@ -120,6 +123,13 @@ cpu_sequencers.append(cpu_seq) topology.addController(l1_cntrl) + l1_cntrl.requestFromL1Cache = ruby_system.network.slave + l1_cntrl.responseFromL1Cache = ruby_system.network.slave + l1_cntrl.unblockFromL1Cache = ruby_system.network.slave + + l1_cntrl.requestToL1Cache = ruby_system.network.master + l1_cntrl.responseToL1Cache = ruby_system.network.master + cntrl_count += 1 ############################################################################ @@ -170,6 +180,13 @@ topology.addController(l1_cntrl) + l1_cntrl.requestFromL1Cache = ruby_system.network.slave + l1_cntrl.responseFromL1Cache = ruby_system.network.slave + l1_cntrl.unblockFromL1Cache = ruby_system.network.slave + + l1_cntrl.requestToL1Cache = 
ruby_system.network.master + l1_cntrl.responseToL1Cache = ruby_system.network.master + # Copy engine cache (make as small as possible, ideally 0) l1i_cache = L1Cache(size = "2kB", assoc = 2) @@ -205,4 +222,11 @@ cpu_sequencers.append(cpu_seq) topology.addController(l1_cntrl) + l1_cntrl.requestFromL1Cache = ruby_system.network.slave + l1_cntrl.responseFromL1Cache = ruby_system.network.slave + l1_cntrl.unblockFromL1Cache = ruby_system.network.slave + + l1_cntrl.requestToL1Cache = ruby_system.network.master + l1_cntrl.responseToL1Cache = ruby_system.network.master + return (cpu_sequencers, dir_cntrls, topology) diff -r 2db8c41103a1 -r dbcd4579a673 configs/gpu_protocol/MI_example_fusion.py --- a/configs/gpu_protocol/MI_example_fusion.py Wed Dec 02 17:08:47 2015 -0600 +++ b/configs/gpu_protocol/MI_example_fusion.py Wed Dec 02 17:08:47 2015 -0600 @@ -100,6 +100,12 @@ cpu_sequencers.append(cpu_seq) topology.addController(l1_cntrl) + # Connect the L1 controllers and the network + l1_cntrl.requestFromCache = ruby_system.network.slave + l1_cntrl.responseFromCache = ruby_system.network.slave + l1_cntrl.forwardToCache = ruby_system.network.master + l1_cntrl.responseToCache = ruby_system.network.master + ############################################################################ # Pagewalk cache # NOTE: We use a CPU L1 cache controller here. This is to facilatate MMU @@ -136,6 +142,12 @@ topology.addController(l1_cntrl) + # Connect the L1 controllers and the network + l1_cntrl.requestFromCache = ruby_system.network.slave + l1_cntrl.responseFromCache = ruby_system.network.slave + l1_cntrl.forwardToCache = ruby_system.network.master + l1_cntrl.responseToCache = ruby_system.network.master + #copy engine cache (make as small as possible, ideally 0) cache = Cache(size = "4kB", assoc = 2) @@ -164,4 +176,10 @@ cpu_sequencers.append(cpu_seq) topology.addController(l1_cntrl) + # Connect the L1 controllers and the network + l1_cntrl.requestFromCache = ruby_system.network.slave + l1_cntrl.responseFromCache = ruby_system.network.slave + l1_cntrl.forwardToCache = ruby_system.network.master + l1_cntrl.responseToCache = ruby_system.network.master + return cpu_sequencers, dir_cntrls, topology diff -r 2db8c41103a1 -r dbcd4579a673 configs/gpu_protocol/MOESI_hammer_fusion.py --- a/configs/gpu_protocol/MOESI_hammer_fusion.py Wed Dec 02 17:08:47 2015 -0600 +++ b/configs/gpu_protocol/MOESI_hammer_fusion.py Wed Dec 02 17:08:47 2015 -0600 @@ -94,6 +94,8 @@ options.allow_atomic_migration, send_evictions = ( options.cpu_type == "detailed"), + transitions_per_cycle = options.ports, + #clk_domain=system.cpu[i].clk_domain, ruby_system = ruby_system) cpu_seq = RubySequencer(version = options.num_cpus + i, @@ -101,10 +103,13 @@ dcache = l1d_cache, access_phys_mem = True, max_outstanding_requests = options.gpu_l1_buf_depth, + #clk_domain=system.cpu[i].clk_domain, ruby_system = ruby_system, connect_to_io = False) l1_cntrl.sequencer = cpu_seq + if options.recycle_latency: + l1_cntrl.recycle_latency = options.recycle_latency exec("ruby_system.l1_cntrl_sp%02d = l1_cntrl" % i) @@ -114,6 +119,16 @@ cpu_sequencers.append(cpu_seq) topology.addController(l1_cntrl) + # Connect the L1 controller and the network + # Connect the buffers from the controller to network + l1_cntrl.requestFromCache = ruby_system.network.slave + l1_cntrl.responseFromCache = ruby_system.network.slave + l1_cntrl.unblockFromCache = ruby_system.network.slave + + # Connect the buffers from the network to the controller + l1_cntrl.forwardToCache = 
ruby_system.network.master + l1_cntrl.responseToCache = ruby_system.network.master + cntrl_count += 1 ############################################################################ @@ -169,6 +184,16 @@ topology.addController(l1_cntrl) + # Connect the L1 controller and the network + # Connect the buffers from the controller to network + l1_cntrl.requestFromCache = ruby_system.network.slave + l1_cntrl.responseFromCache = ruby_system.network.slave + l1_cntrl.unblockFromCache = ruby_system.network.slave + + # Connect the buffers from the network to the controller + l1_cntrl.forwardToCache = ruby_system.network.master + l1_cntrl.responseToCache = ruby_system.network.master + # Copy engine cache (make as small as possible, ideally 0) l1i_cache = L1Cache(size = "2kB", assoc = 2) l1d_cache = L1Cache(size = "2kB", assoc = 2) @@ -204,4 +229,14 @@ cpu_sequencers.append(cpu_seq) topology.addController(l1_cntrl) + # Connect the L1 controller and the network + # Connect the buffers from the controller to network + l1_cntrl.requestFromCache = ruby_system.network.slave + l1_cntrl.responseFromCache = ruby_system.network.slave + l1_cntrl.unblockFromCache = ruby_system.network.slave + + # Connect the buffers from the network to the controller + l1_cntrl.forwardToCache = ruby_system.network.master + l1_cntrl.responseToCache = ruby_system.network.master + return (cpu_sequencers, dir_cntrl_nodes, topology) diff -r 2db8c41103a1 -r dbcd4579a673 configs/gpu_protocol/MOESI_hammer_split.py --- a/configs/gpu_protocol/MOESI_hammer_split.py Wed Dec 02 17:08:47 2015 -0600 +++ b/configs/gpu_protocol/MOESI_hammer_split.py Wed Dec 02 17:08:47 2015 -0600 @@ -102,4 +102,14 @@ cpu_sequencers.append(gpu_ce_seq) topology.addController(l1_cntrl) + # Connect the L1 controller and the network + # Connect the buffers from the controller to network + l1_cntrl.requestFromCache = ruby_system.network.slave + l1_cntrl.responseFromCache = ruby_system.network.slave + l1_cntrl.unblockFromCache = ruby_system.network.slave + + # Connect the buffers from the network to the controller + l1_cntrl.forwardToCache = ruby_system.network.master + l1_cntrl.responseToCache = ruby_system.network.master + return (cpu_sequencers, dir_cntrl_nodes, topology) diff -r 2db8c41103a1 -r dbcd4579a673 configs/gpu_protocol/VI_hammer.py --- a/configs/gpu_protocol/VI_hammer.py Wed Dec 02 17:08:47 2015 -0600 +++ b/configs/gpu_protocol/VI_hammer.py Wed Dec 02 17:08:47 2015 -0600 @@ -121,6 +121,16 @@ cpu_sequencers.append(cpu_seq) topology.add(l1_cntrl) + # Connect the L1 controller and the network + # Connect the buffers from the controller to network + l1_cntrl.requestFromCache = ruby_system.network.slave + l1_cntrl.responseFromCache = ruby_system.network.slave + l1_cntrl.unblockFromCache = ruby_system.network.slave + + # Connect the buffers from the network to the controller + l1_cntrl.forwardToCache = ruby_system.network.master + l1_cntrl.responseToCache = ruby_system.network.master + cpu_mem_range = AddrRange(options.total_mem_size) mem_module_size = cpu_mem_range.size() / options.num_dirs @@ -183,6 +193,16 @@ exec("ruby_system.dir_cntrl%d = dir_cntrl" % i) dir_cntrl_nodes.append(dir_cntrl) + # Connect the directory controller to the network + dir_cntrl.forwardFromDir = ruby_system.network.slave + dir_cntrl.responseFromDir = ruby_system.network.slave + dir_cntrl.dmaResponseFromDir = ruby_system.network.slave + + dir_cntrl.unblockToDir = ruby_system.network.master + dir_cntrl.responseToDir = ruby_system.network.master + dir_cntrl.requestToDir = 
ruby_system.network.master + dir_cntrl.dmaRequestToDir = ruby_system.network.master + dma_cntrl_nodes = [] for i, dma_port in enumerate(dma_ports): # @@ -202,4 +222,8 @@ if options.recycle_latency: dma_cntrl.recycle_latency = options.recycle_latency + # Connect the dma controller to the network + dma_cntrl.responseFromDir = ruby_system.network.master + dma_cntrl.requestToDir = ruby_system.network.slave + return (cpu_sequencers, dir_cntrl_nodes, dma_cntrl_nodes, topology) diff -r 2db8c41103a1 -r dbcd4579a673 configs/gpu_protocol/VI_hammer_fusion.py --- a/configs/gpu_protocol/VI_hammer_fusion.py Wed Dec 02 17:08:47 2015 -0600 +++ b/configs/gpu_protocol/VI_hammer_fusion.py Wed Dec 02 17:08:47 2015 -0600 @@ -120,6 +120,11 @@ all_sequencers.append(gpu_seq) gpu_cluster.add(l1_cntrl) + # Connect the controller to the network + l1_cntrl.requestFromL1Cache = ruby_system.network.slave + l1_cntrl.atomicRequestFromL1Cache = ruby_system.network.slave + l1_cntrl.responseToL1Cache = ruby_system.network.master + l2_index_start = block_size_bits + l2_bits # Use L2 cache and interconnect latencies to calculate protocol latencies # NOTE! These latencies are in Ruby (cache) cycles, not SM cycles @@ -155,6 +160,17 @@ gpu_cluster.add(l2_cluster) l2_clusters.append(l2_cluster) + # Connect the controller to the network + l2_cntrl.responseToL1Cache = ruby_system.network.slave + l2_cntrl.requestFromCache = ruby_system.network.slave + l2_cntrl.responseFromCache = ruby_system.network.slave + l2_cntrl.unblockFromCache = ruby_system.network.slave + + l2_cntrl.requestFromL1Cache = ruby_system.network.master + l2_cntrl.atomicRequestFromL1Cache = ruby_system.network.master + l2_cntrl.forwardToCache = ruby_system.network.master + l2_cntrl.responseToCache = ruby_system.network.master + ############################################################################ # Pagewalk cache # NOTE: We use a CPU L1 cache controller here. This is to facilatate MMU @@ -210,6 +226,16 @@ gpu_cluster.add(l1_cntrl) + # Connect the L1 controller and the network + # Connect the buffers from the controller to network + l1_cntrl.requestFromCache = ruby_system.network.slave + l1_cntrl.responseFromCache = ruby_system.network.slave + l1_cntrl.unblockFromCache = ruby_system.network.slave + + # Connect the buffers from the network to the controller + l1_cntrl.forwardToCache = ruby_system.network.master + l1_cntrl.responseToCache = ruby_system.network.master + # # Create controller for the copy engine to connect to in GPU cluster @@ -235,6 +261,9 @@ all_sequencers.append(gpu_ce_seq) + gpu_ce_cntrl.responseFromDir = ruby_system.network.master + gpu_ce_cntrl.reqToDirectory = ruby_system.network.slave + complete_cluster = Cluster(intBW = 32, extBW = 32) complete_cluster.add(gpu_ce_cntrl) complete_cluster.add(cpu_cluster) diff -r 2db8c41103a1 -r dbcd4579a673 configs/gpu_protocol/VI_hammer_split.py --- a/configs/gpu_protocol/VI_hammer_split.py Wed Dec 02 17:08:47 2015 -0600 +++ b/configs/gpu_protocol/VI_hammer_split.py Wed Dec 02 17:08:47 2015 -0600 @@ -151,6 +151,11 @@ all_sequencers.append(gpu_seq) gpu_cluster.add(l1_cntrl) + # Connect the controller to the network + l1_cntrl.requestFromL1Cache = ruby_system.network.master + l1_cntrl.atomicRequestFromL1Cache = ruby_system.network.master + l1_cntrl.responseToL1Cache = ruby_system.network.slave + l2_index_start = block_size_bits + l2_bits # Use L2 cache and interconnect latencies to calculate protocol latencies # NOTE! 
These latencies are in Ruby (cache) cycles, not SM cycles @@ -186,6 +191,16 @@ gpu_cluster.add(l2_cluster) l2_clusters.append(l2_cluster) + # Connect the controller to the network + l2_cntrl.responseToL1Cache = ruby_system.network.master + l2_cntrl.requestFromCache = ruby_system.network.master + l2_cntrl.responseFromCache = ruby_system.network.master + l2_cntrl.unblockFromCache = ruby_system.network.master + l2_cntrl.requestFromL1Cache = ruby_system.network.slave + l2_cntrl.atomicRequestFromL1Cache = ruby_system.network.slave + l2_cntrl.forwardToCache = ruby_system.network.slave + l2_cntrl.responseToCache = ruby_system.network.slave + gpu_phys_mem_size = system.gpu_physmem.range.size() if options.num_dev_dirs > 0: @@ -254,6 +269,16 @@ exec("ruby_system.dev_dir_cntrl%d = dev_dir_cntrl" % i) dir_cntrls.append(dev_dir_cntrl) + + # Connect the directory controller to the network + dir_cntrl.forwardFromDir = ruby_system.network.slave + dir_cntrl.responseFromDir = ruby_system.network.slave + dir_cntrl.dmaResponseFromDir = ruby_system.network.slave + + dir_cntrl.unblockToDir = ruby_system.network.master + dir_cntrl.responseToDir = ruby_system.network.master + dir_cntrl.requestToDir = ruby_system.network.master + dir_cntrl.dmaRequestToDir = ruby_system.network.master else: # Since there are no device directories, use CPU directories # Fix up the memory sizes of the CPU directories @@ -288,6 +313,9 @@ all_sequencers.append(cpu_ce_seq) all_sequencers.append(gpu_ce_seq) + gpu_ce_cntrl.responseFromDir = ruby_system.network.slave + gpu_ce_cntrl.reqToDirectory = ruby_system.network.master + complete_cluster = Cluster(intBW = 32, extBW = 32) complete_cluster.add(cpu_ce_cntrl) complete_cluster.add(gpu_ce_cntrl) diff -r 2db8c41103a1 -r dbcd4579a673 src/gpu/shader_mmu.hh --- a/src/gpu/shader_mmu.hh Wed Dec 02 17:08:47 2015 -0600 +++ b/src/gpu/shader_mmu.hh Wed Dec 02 17:08:47 2015 -0600 @@ -99,7 +99,7 @@ BaseTLB::Mode _mode, ThreadContext *_tc, bool prefetch=false); void markDelayed() { wrappedTranslation->markDelayed(); } - void finish(Fault fault, RequestPtr _req, ThreadContext *_tc, + void finish(const Fault &fault, RequestPtr _req, ThreadContext *_tc, BaseTLB::Mode _mode) { assert(_mode == mode); diff -r 2db8c41103a1 -r dbcd4579a673 src/mem/protocol/VI-ce.sm --- a/src/mem/protocol/VI-ce.sm Wed Dec 02 17:08:47 2015 -0600 +++ b/src/mem/protocol/VI-ce.sm Wed Dec 02 17:08:47 2015 -0600 @@ -1,12 +1,14 @@ machine(GPUCopyDMA, "VI Copy Engine Controller") -: Sequencer * sequencer, - Cycles request_latency = 6 +: Sequencer * sequencer; + Cycles request_latency := 6; + + MessageBuffer * responseFromDir, network="From", virtual_network="1", + ordered="true", vnet_type="response"; + MessageBuffer * reqToDirectory, network="To", virtual_network="0", + ordered="true", vnet_type="request"; + { - - MessageBuffer responseFromDir, network="From", virtual_network="1", ordered="true", vnet_type="response"; - MessageBuffer reqToDirectory, network="To", virtual_network="0", ordered="true", vnet_type="request"; - state_declaration(State, desc="CE states", default="GPUCopyDMA_State_READY") { READY, AccessPermission:Invalid, desc="Ready to accept a new request"; BUSY_RD, AccessPermission:Busy, desc="Busy: currently processing a request"; diff -r 2db8c41103a1 -r dbcd4579a673 src/mem/protocol/VI_hammer-CPUCache.sm --- a/src/mem/protocol/VI_hammer-CPUCache.sm Wed Dec 02 17:08:47 2015 -0600 +++ b/src/mem/protocol/VI_hammer-CPUCache.sm Wed Dec 02 17:08:47 2015 -0600 @@ -34,25 +34,29 @@ */ machine({L1Cache, L2Cache}, "AMD Hammer-like 
protocol") -: Sequencer * sequencer, - CacheMemory * L1Icache, - CacheMemory * L1Dcache, - CacheMemory * L2cache, - Cycles cache_response_latency = 10, - Cycles issue_latency = 1, - Cycles l2_cache_hit_latency = 15, - bool no_mig_atomic = true, - bool send_evictions -{ +: Sequencer * sequencer; + CacheMemory * L1Icache; + CacheMemory * L1Dcache; + CacheMemory * L2cache; + Cycles cache_response_latency := 10; + Cycles issue_latency := 1; + Cycles l2_cache_hit_latency := 15; + bool no_mig_atomic := "True"; + bool send_evictions; // NETWORK BUFFERS - MessageBuffer requestFromCache, network="To", virtual_network="2", ordered="false", vnet_type="request"; - MessageBuffer responseFromCache, network="To", virtual_network="4", ordered="false", vnet_type="response"; - MessageBuffer unblockFromCache, network="To", virtual_network="5", ordered="false", vnet_type="unblock"; + MessageBuffer * requestFromCache, network="To", virtual_network="2", + ordered="false", vnet_type="request"; + MessageBuffer * responseFromCache, network="To", virtual_network="4", + ordered="false", vnet_type="response"; + MessageBuffer * unblockFromCache, network="To", virtual_network="5", + ordered="false", vnet_type="unblock"; - MessageBuffer forwardToCache, network="From", virtual_network="3", ordered="false", vnet_type="forward"; - MessageBuffer responseToCache, network="From", virtual_network="4", ordered="false", vnet_type="response"; - + MessageBuffer * forwardToCache, network="From", virtual_network="3", + ordered="false", vnet_type="forward"; + MessageBuffer * responseToCache, network="From", virtual_network="4", + ordered="false", vnet_type="response"; +{ // STATES state_declaration(State, desc="Cache states", default="L1Cache_State_I") { diff -r 2db8c41103a1 -r dbcd4579a673 src/mem/protocol/VI_hammer-GPUL1cache.sm --- a/src/mem/protocol/VI_hammer-GPUL1cache.sm Wed Dec 02 17:08:47 2015 -0600 +++ b/src/mem/protocol/VI_hammer-GPUL1cache.sm Wed Dec 02 17:08:47 2015 -0600 @@ -1,18 +1,22 @@ machine(GPUL1Cache, "VI GPU L1 Cache") -: Sequencer * sequencer, - CacheMemory * cache, - int l2_select_num_bits, - int num_l2, - Cycles issue_latency = 416, -{ +: Sequencer * sequencer; + CacheMemory * cache; + int l2_select_num_bits; + int num_l2; + Cycles issue_latency := 416; + // NETWORK BUFFERS - MessageBuffer requestFromL1Cache, network="To", virtual_network="7", ordered="true", vnet_type="request"; - MessageBuffer atomicRequestFromL1Cache, network="To", virtual_network="8", ordered="true", vnet_type="request"; + MessageBuffer * requestFromL1Cache, network="To", virtual_network="7", + ordered="true", vnet_type="request"; + MessageBuffer * atomicRequestFromL1Cache, network="To", virtual_network="8", + ordered="true", vnet_type="request"; - MessageBuffer responseToL1Cache, network="From", virtual_network="6", ordered="true", vnet_type="response"; + MessageBuffer * responseToL1Cache, network="From", virtual_network="6", + ordered="true", vnet_type="response"; +{ // STATES state_declaration(State, desc="Cache states") { I, AccessPermission:Invalid, desc="Not Present/Invalid"; diff -r 2db8c41103a1 -r dbcd4579a673 src/mem/protocol/VI_hammer-GPUL2cache.sm --- a/src/mem/protocol/VI_hammer-GPUL2cache.sm Wed Dec 02 17:08:47 2015 -0600 +++ b/src/mem/protocol/VI_hammer-GPUL2cache.sm Wed Dec 02 17:08:47 2015 -0600 @@ -1,28 +1,36 @@ machine(GPUL2Cache, "Simple write back L2 cache") - : CacheMemory * L2cache, - Cycles l2_request_latency = 260, - Cycles l2_response_latency = 2, - Cycles cache_response_latency = 260, -{ + : CacheMemory * L2cache; 
+ Cycles l2_request_latency := 260; + Cycles l2_response_latency := 2; + Cycles cache_response_latency := 260; //Note: we might have a problem if two Get atomics arrive from different L1's at the same time // NETWORK BUFFERS // Buffers to and from L1 caches - MessageBuffer requestFromL1Cache, network="From", virtual_network="7", ordered="true", vnet_type="request"; - MessageBuffer responseToL1Cache, network="To", virtual_network="6", ordered="true", vnet_type="response"; - MessageBuffer atomicRequestFromL1Cache, network="From", virtual_network="8", ordered="true", vnet_type="request"; + MessageBuffer * requestFromL1Cache, network="From", virtual_network="7", + ordered="true", vnet_type="request"; + MessageBuffer * responseToL1Cache, network="To", virtual_network="6", + ordered="true", vnet_type="response"; + MessageBuffer * atomicRequestFromL1Cache, network="From", virtual_network="8", + ordered="true", vnet_type="request"; // Buffers to / from the dir and other caches - MessageBuffer requestFromCache, network="To", virtual_network="2", ordered="false", vnet_type="request"; - MessageBuffer responseFromCache, network="To", virtual_network="4", ordered="false", vnet_type="response"; - MessageBuffer unblockFromCache, network="To", virtual_network="5", ordered="false", vnet_type="unblock"; + MessageBuffer * requestFromCache, network="To", virtual_network="2", + ordered="false", vnet_type="request"; + MessageBuffer * responseFromCache, network="To", virtual_network="4", + ordered="false", vnet_type="response"; + MessageBuffer * unblockFromCache, network="To", virtual_network="5", + ordered="false", vnet_type="unblock"; - MessageBuffer forwardToCache, network="From", virtual_network="3", ordered="false", vnet_type="forward"; - MessageBuffer responseToCache, network="From", virtual_network="4", ordered="false", vnet_type="response"; + MessageBuffer * forwardToCache, network="From", virtual_network="3", + ordered="false", vnet_type="forward"; + MessageBuffer * responseToCache, network="From", virtual_network="4", + ordered="false", vnet_type="response"; +{ // STATES state_declaration(State, desc="Cache states") { I, AccessPermission:Invalid, desc="Idle"; diff -r 2db8c41103a1 -r dbcd4579a673 src/mem/protocol/VI_hammer-dir.sm --- a/src/mem/protocol/VI_hammer-dir.sm Wed Dec 02 17:08:47 2015 -0600 +++ b/src/mem/protocol/VI_hammer-dir.sm Wed Dec 02 17:08:47 2015 -0600 @@ -34,28 +34,35 @@ */ machine(Directory, "AMD Hammer-like protocol") -: DirectoryMemory * directory, - CacheMemory * probeFilter, - MemoryControl * memBuffer, - Cycles memory_controller_latency = 12, - bool probe_filter_enabled = false, - bool full_bit_dir_enabled = false -{ +: DirectoryMemory * directory; + CacheMemory * probeFilter; + MemoryControl * memBuffer; + Cycles memory_controller_latency := 12; + bool probe_filter_enabled := "False"; + bool full_bit_dir_enabled := "False"; - MessageBuffer forwardFromDir, network="To", virtual_network="3", ordered="false", vnet_type="forward"; - MessageBuffer responseFromDir, network="To", virtual_network="4", ordered="false", vnet_type="response"; + MessageBuffer * forwardFromDir, network="To", virtual_network="3", + ordered="false", vnet_type="forward"; + MessageBuffer * responseFromDir, network="To", virtual_network="4", + ordered="false", vnet_type="response"; // // For a finite buffered network, note that the DMA response network only // works at this relatively lower numbered (lower priority) virtual network // because the trigger queue decouples cache responses from DMA responses. 
// - MessageBuffer dmaResponseFromDir, network="To", virtual_network="1", ordered="true", vnet_type="response"; + MessageBuffer * dmaResponseFromDir, network="To", virtual_network="1", + ordered="true", vnet_type="response"; - MessageBuffer unblockToDir, network="From", virtual_network="5", ordered="false", vnet_type="unblock"; - MessageBuffer responseToDir, network="From", virtual_network="4", ordered="false", vnet_type="response"; - MessageBuffer requestToDir, network="From", virtual_network="2", ordered="false", vnet_type="request", recycle_latency="1"; - MessageBuffer dmaRequestToDir, network="From", virtual_network="0", ordered="true", vnet_type="request"; + MessageBuffer * unblockToDir, network="From", virtual_network="5", + ordered="false", vnet_type="unblock"; + MessageBuffer * responseToDir, network="From", virtual_network="4", + ordered="false", vnet_type="response"; + MessageBuffer * requestToDir, network="From", virtual_network="2", + ordered="false", vnet_type="request", recycle_latency="1"; + MessageBuffer * dmaRequestToDir, network="From", virtual_network="0", + ordered="true", vnet_type="request"; +{ // STATES state_declaration(State, desc="Directory states", default="Directory_State_E") { // Base states diff -r 2db8c41103a1 -r dbcd4579a673 src/mem/protocol/VI_hammer-dma.sm --- a/src/mem/protocol/VI_hammer-dma.sm Wed Dec 02 17:08:47 2015 -0600 +++ b/src/mem/protocol/VI_hammer-dma.sm Wed Dec 02 17:08:47 2015 -0600 @@ -28,13 +28,15 @@ machine(DMA, "DMA Controller") -: DMASequencer * dma_sequencer, - Cycles request_latency = 6 +: DMASequencer * dma_sequencer; + Cycles request_latency := 6; + + MessageBuffer * responseFromDir, network="From", virtual_network="1", + ordered="true", vnet_type="response", no_vector="true"; + MessageBuffer * reqToDirectory, network="To", virtual_network="0", + ordered="false", vnet_type="request", no_vector="true"; + { - - MessageBuffer responseFromDir, network="From", virtual_network="1", ordered="true", vnet_type="response", no_vector="true"; - MessageBuffer reqToDirectory, network="To", virtual_network="0", ordered="false", vnet_type="request", no_vector="true"; - state_declaration(State, desc="DMA states", default="DMA_State_READY") { diff -r 2db8c41103a1 -r dbcd4579a673 src/mem/ruby/RubySlicc_GPUMappings.hh --- a/src/mem/ruby/RubySlicc_GPUMappings.hh Wed Dec 02 17:08:47 2015 -0600 +++ b/src/mem/ruby/RubySlicc_GPUMappings.hh Wed Dec 02 17:08:47 2015 -0600 @@ -29,13 +29,14 @@ #ifndef __MEM_RUBY_SLICC_GPUMAPPINGS_HH__ #define __MEM_RUBY_SLICC_GPUMAPPINGS_HH__ -#include +#include + #include "mem/protocol/MachineType.hh" #include "mem/ruby/common/Address.hh" #include "mem/ruby/common/Global.hh" +#include "mem/ruby/common/MachineID.hh" #include "mem/ruby/common/NetDest.hh" -#include "mem/ruby/system/DirectoryMemory.hh" -#include "mem/ruby/system/MachineID.hh" +#include "mem/ruby/structures/DirectoryMemory.hh" inline MachineID getL2ID(Address addr, int num_l2, int select_bits, int select_start_bit) # HG changeset patch # User Lena Olson # Date 1449097728 21600 # Node ID 3ee9d80f490fad33e0fa9c18fdba958942d3c63b # Parent dbcd4579a673931bb7f07c9328c8d66fc37f179f Add the ability to do TLB shootdown. 
This also adds a hacky way to test TLB shootdown diff -r dbcd4579a673 -r 3ee9d80f490f configs/fs_fusion.py --- a/configs/fs_fusion.py Wed Dec 02 17:08:47 2015 -0600 +++ b/configs/fs_fusion.py Wed Dec 02 17:08:48 2015 -0600 @@ -141,6 +141,9 @@ system.gpu_physmem = SimpleMemory(range = gpu_mem_range) system.gpu_physmem.port = system.iobus.master +system.gpu.test_tlb_shootdown = True +system.gpu.tlb_shootdown_tick = 5461844154685 + (29247313 / 4) + # # Setup Ruby # diff -r dbcd4579a673 -r 3ee9d80f490f src/gpu/gpgpu-sim/CudaGPU.py --- a/src/gpu/gpgpu-sim/CudaGPU.py Wed Dec 02 17:08:47 2015 -0600 +++ b/src/gpu/gpgpu-sim/CudaGPU.py Wed Dec 02 17:08:48 2015 -0600 @@ -55,3 +55,7 @@ gpu_memory_range = Param.AddrRange(AddrRange('1kB'), "The address range for the GPU memory space") shader_mmu = Param.ShaderMMU(ShaderMMU(), "Memory managment unit for this GPU") + + test_tlb_shootdown = Param.Bool(False, "If true, insert a shootdown event") + tlb_shootdown_tick = Param.Tick(0, "Relative tick after restore to issue the shootdown") + diff -r dbcd4579a673 -r 3ee9d80f490f src/gpu/gpgpu-sim/cuda_core.cc --- a/src/gpu/gpgpu-sim/cuda_core.cc Wed Dec 02 17:08:47 2015 -0600 +++ b/src/gpu/gpgpu-sim/cuda_core.cc Wed Dec 02 17:08:48 2015 -0600 @@ -60,6 +60,10 @@ warpSize = cudaGPU->getWarpSize(); signalKernelFinish = false; + signalFlushFinish = false; + flushFinished = false; + + memoryPaused = false; if (p->port_lsq_port_connection_count != warpSize) { panic("Shader core lsq_port size != to warp size\n"); @@ -247,6 +251,11 @@ bool CudaCore::executeMemOp(const warp_inst_t &inst) { + if (memoryPaused) { + // return true: there should be a pipeline stall + return true; + } + assert(inst.space.get_type() == global_space || inst.space.get_type() == const_space || inst.op == BARRIER_OP || @@ -397,6 +406,12 @@ shaderImpl->finish_kernel(); signalKernelFinish = false; } + if (signalFlushFinish) { + flushFinished = true; + cudaGPU->cudaCoreFlushFinish(); + signalFlushFinish = false; + // NOTE: this signal flag will be reset by the cudaGPU. + } } else { panic("Received unhandled packet type in control port"); } @@ -427,6 +442,13 @@ } void +CudaCore::beginCoreFlush() +{ + signalFlushFinish = true; + flush(); +} + +void CudaCore::finishKernel() { numKernelsCompleted++; diff -r dbcd4579a673 -r 3ee9d80f490f src/gpu/gpgpu-sim/cuda_core.hh --- a/src/gpu/gpgpu-sim/cuda_core.hh Wed Dec 02 17:08:47 2015 -0600 +++ b/src/gpu/gpgpu-sim/cuda_core.hh Wed Dec 02 17:08:48 2015 -0600 @@ -180,6 +180,15 @@ // if true then need to signal GPGPU-Sim once cleanup is done bool signalKernelFinish; + // if true then need to signal the cudaGPU once the flush is finished + bool signalFlushFinish; + + // The flush has finished on the core, but not others. 
Cleared by cudaGPU + bool flushFinished; + + // if true, do not accept any new memory requests from the shader cores + bool memoryPaused; + // Returns the line of the address, a Addr addrToLine(Addr a); @@ -230,6 +239,12 @@ // Handle an instruction port retry request void handleRetry(); + /** + * Flush the core of all pending instructions, + * This is currently used to force the LSQ to flush on kernel end + */ + void flush(); + public: // Receive and complete an instruction fetch void recvInstResp(PacketPtr pkt); @@ -263,10 +278,25 @@ void writebackClear(); /** - * Flush the core of all pending instructions, - * This is currently used to force the LSQ to flush on kernel end + * Called from the cudaGPU when flushing all of the GPU state */ - void flush(); + void beginCoreFlush(); + + /** + * Return whether or not the flush is finished + */ + bool checkFlushFinish() { return flushFinished; } + + /** + * Clear the flushFinished flag. Called from cudaGPU. + */ + void clearFlushFinish() { flushFinished = false; } + + /** + * (Un)Pause all accesses to memory. Currently used during TLB shootdown + */ + void pauseMemory() { memoryPaused = true; } + void unpauseMemory() { memoryPaused = false; } /** * Called from GPGPU-Sim when a kernel completes on this shader diff -r dbcd4579a673 -r 3ee9d80f490f src/gpu/gpgpu-sim/cuda_gpu.cc --- a/src/gpu/gpgpu-sim/cuda_gpu.cc Wed Dec 02 17:08:47 2015 -0600 +++ b/src/gpu/gpgpu-sim/cuda_gpu.cc Wed Dec 02 17:08:48 2015 -0600 @@ -62,13 +62,15 @@ CudaGPU::CudaGPU(const Params *p) : ClockedObject(p), _params(p), gpuTickEvent(this, false), streamTickEvent(this, true), + tlbShootdownEvent(this), system(p->sys), warpSize(p->warp_size), sharedMemDelay(p->shared_mem_delay), gpgpusimConfigPath(p->config_path), launchDelay(p->kernel_launch_delay), returnDelay(p->kernel_return_delay), unblockNeeded(false), ruby(p->ruby), runningTC(NULL), runningStream(NULL), runningTID(-1), clearTick(0), dumpKernelStats(p->dump_kernel_stats), pageTable(), manageGPUMemory(p->manage_gpu_memory), - gpuMemoryRange(p->gpu_memory_range), shaderMMU(p->shader_mmu) + gpuMemoryRange(p->gpu_memory_range), shaderMMU(p->shader_mmu), + testShootdown(p->test_tlb_shootdown), shootdownTick(p->tlb_shootdown_tick) { // Register this device as a CUDA-enabled GPU cudaDeviceID = registerCudaDevice(this); @@ -239,6 +241,10 @@ (*iter)->initialize(); } + if (testShootdown) { + schedule(tlbShootdownEvent, shootdownTick); + } + if (!restoring) { return; } @@ -278,6 +284,11 @@ clearTick = curTick(); } +void CudaGPU::registerTLB(ShaderTLB *tlb) +{ + shaderTLBs.push_back(tlb); +} + void CudaGPU::registerCudaCore(CudaCore *sc) { cudaCores.push_back(sc); @@ -488,6 +499,26 @@ endStreamOperation(); } +void CudaGPU::cudaCoreFlushFinish() +{ + DPRINTF(CudaGPU, "Shootdown: Got a finish flush.\n"); + bool allDone = true; + for (auto it: cudaCores) { + if (!it->checkFlushFinish()) { + DPRINTF(CudaGPU, "Shootdown: Not done yet...\n"); + allDone = false; + break; + } + } + if (allDone) { + DPRINTF(CudaGPU, "Finally done!\n"); + for (auto it: cudaCores) { + it->clearFlushFinish(); + } + schedule(tlbShootdownEvent, nextCycle()); + } +} + // TODO: When we move the stream manager into libcuda, this will need to be // eliminated, and libcuda will have to decide when to block the calling thread bool CudaGPU::needsToBlock() @@ -722,6 +753,56 @@ ; } +void CudaGPU::TLBShootdownEvent::process() +{ + DPRINTF(CudaGPU, "Processing shootdown!\n"); + + switch(stage) { + case Stage::Idle: + DPRINTF(CudaGPU, "Shootdown: Pausing memory\n"); + for 
(auto it: gpu->cudaCores) { + it->pauseMemory(); + } + stage = Stage::Pausing; + gpu->schedule(this, gpu->nextCycle()); + break; + case Stage::Pausing: + DPRINTF(CudaGPU, "Shootdown: Flushing cores\n"); + for (auto it: gpu->cudaCores) { + it->beginCoreFlush(); + } + stage = Stage::FlushingL1s; + break; + case Stage::FlushingL1s: + DPRINTF(CudaGPU, "Shootdown: DONE flushing cores\n"); + gpu->schedule(this, gpu->nextCycle()); + stage = Stage::FlushingOthers; + break; + case Stage::FlushingOthers: + DPRINTF(CudaGPU, "Shootdown: Flushing others\n"); + DPRINTF(CudaGPU, "Shootdown: Flushing TLBs\n"); + for (auto it: gpu->shaderTLBs) { + it->flushAll(); + } + + DPRINTF(CudaGPU, "Shootdown: Flushing MMU\n"); + gpu->shaderMMU->flushAll(); + gpu->schedule(this, gpu->clockEdge(Cycles(5))); + stage = Stage::Unpausing; + break; + case Stage::Unpausing: + DPRINTF(CudaGPU, "Shootdown: Unpausing\n"); + for (auto it: gpu->cudaCores) { + it->unpauseMemory(); + } + stage = Stage::Idle; + // NO need to schedule anything + break; + default: + panic("Unexpected current shootdown stage"); + } +} + /** * virtual process function that is invoked when the callback * queue is executed. diff -r dbcd4579a673 -r 3ee9d80f490f src/gpu/gpgpu-sim/cuda_gpu.hh --- a/src/gpu/gpgpu-sim/cuda_gpu.hh Wed Dec 02 17:08:47 2015 -0600 +++ b/src/gpu/gpgpu-sim/cuda_gpu.hh Wed Dec 02 17:08:48 2015 -0600 @@ -38,6 +38,7 @@ #include "base/callback.hh" #include "debug/CudaGPUPageTable.hh" #include "gpgpu-sim/gpu-sim.h" +#include "gpu/shader_tlb.hh" #include "params/CudaGPU.hh" #include "sim/process.hh" #include "sim/system.hh" @@ -162,6 +163,21 @@ } }; + class TLBShootdownEvent : public Event + { + + private: + CudaGPU *gpu; + enum class Stage {Idle, Pausing, FlushingL1s, FlushingOthers, + Unpausing}; + Stage stage; + + public: + TLBShootdownEvent(CudaGPU *_gpu) : gpu(_gpu), stage(Stage::Idle) {} + void process(); + }; + friend class TLBShootdownEvent; + const CudaGPUParams *_params; const Params * params() const { return dynamic_cast<const Params *>(_params); } @@ -171,6 +187,8 @@ /// Tick for when the stream manager needs execute TickEvent streamTickEvent; + TLBShootdownEvent tlbShootdownEvent; + private: // The CUDA device ID for this GPU unsigned cudaDeviceID; @@ -212,6 +230,9 @@ /// Holds all of the CUDA cores in this GPU std::vector<CudaCore*> cudaCores; + /// Holds all of the GPU shader private TLBs + std::vector<ShaderTLB*> shaderTLBs; + /// The thread context, stream and thread ID currently running on the SPA ThreadContext *runningTC; struct CUstream_st *runningStream; @@ -324,6 +345,9 @@ CudaDeviceProperties deviceProperties; + bool testShootdown; + Tick shootdownTick; + public: /// Constructor CudaGPU(const Params *p); @@ -338,6 +362,7 @@ /// Register devices callbacks void registerCudaCore(CudaCore *sc); void registerCopyEngine(GPUCopyEngine *ce); + void registerTLB(ShaderTLB *tlb); /// Getter for whether we are using Ruby or GPGPU-Sim memory modeling CudaDeviceProperties *getDeviceProperties() { return &deviceProperties; } @@ -400,6 +425,9 @@ /// Called by the copy engine when a memcpy or memset is complete void finishCopyOperation(); + /// Called by the cuda cores when they finish flushing if signalFlush is set + void cudaCoreFlushFinish(); + /// Called from shader TLB to be used for TLB lookups /// TODO: Move the thread context handling to GPU context when we get there ThreadContext *getThreadContext() { return runningTC; } diff -r dbcd4579a673 -r 3ee9d80f490f src/gpu/shader_mmu.cc --- a/src/gpu/shader_mmu.cc Wed Dec 02 17:08:47 2015 -0600 +++ 
b/src/gpu/shader_mmu.cc Wed Dec 02 17:08:48 2015 -0600 @@ -484,6 +484,18 @@ } void +ShaderMMU::flushAll() +{ + assert(pendingWalks.empty()); + assert(outstandingWalks.empty()); + assert(pendingFaults.empty()); + if (tlb != nullptr) { + tlb->flushAll(); + } + prefetchBuffer.clear(); +} + +void ShaderMMU::regStats() { numPagefaults diff -r dbcd4579a673 -r 3ee9d80f490f src/gpu/shader_mmu.hh --- a/src/gpu/shader_mmu.hh Wed Dec 02 17:08:47 2015 -0600 +++ b/src/gpu/shader_mmu.hh Wed Dec 02 17:08:48 2015 -0600 @@ -172,6 +172,9 @@ /// Handle a page fault once it's done (called from CUDA API via CudaGPU) void handleFinishPageFault(ThreadContext *tc); + /// Flush any TLBs and others that needs to be flushed for TLB shootdown + void flushAll(); + void regStats(); Stats::Scalar numPagefaults; diff -r dbcd4579a673 -r 3ee9d80f490f src/gpu/shader_tlb.cc --- a/src/gpu/shader_tlb.cc Wed Dec 02 17:08:47 2015 -0600 +++ b/src/gpu/shader_tlb.cc Wed Dec 02 17:08:48 2015 -0600 @@ -53,6 +53,8 @@ tlbMemory = new InfiniteTLBMemory(); } mmu = cudaGPU->getMMU(); + + cudaGPU->registerTLB(this); } void @@ -165,7 +167,7 @@ void ShaderTLB::flushAll() { - panic("Flush all unimplemented"); + tlbMemory->flushAll(); } bool @@ -218,6 +220,18 @@ } void +TLBMemory::flushAll() +{ + for (int i=0; i < ways; i++) { + for (int j=0; j < sets; j++) { # HG changeset patch # User Lena Olson # Date 1449097728 21600 # Node ID 7b001aa001f007c6af4ab6ddfcf2f3b491b108d9 # Parent 3ee9d80f490fad33e0fa9c18fdba958942d3c63b This is adding slicc files to implement the BCU, but it seems like the wrong way to go diff -r 3ee9d80f490f -r 7b001aa001f0 configs/gpu_protocol/MOESI_hammer_bcu.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/configs/gpu_protocol/MOESI_hammer_bcu.py Wed Dec 02 17:08:48 2015 -0600 @@ -0,0 +1,2 @@ +# Almost empty file to trick ruby into working +from MOESI_hammer import * \ No newline at end of file diff -r 3ee9d80f490f -r 7b001aa001f0 configs/gpu_protocol/MOESI_hammer_bcu_fusion.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/configs/gpu_protocol/MOESI_hammer_bcu_fusion.py Wed Dec 02 17:08:48 2015 -0600 @@ -0,0 +1,258 @@ +# Copyright (c) 2006-2007 The Regents of The University of Michigan +# Copyright (c) 2009 Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# Authors: Brad Beckmann + +import math +import m5 +from m5.objects import * +from m5.defines import buildEnv +from Ruby import create_topology + +# +# Note: the L1 Cache latency is only used by the sequencer on fast path hits +# +class L1Cache(RubyCache): + latency = 1 + +# +# Note: the L2 Cache latency is not currently used +# +class L2Cache(RubyCache): + latency = 10 + +def create_system(options, system, dma_ports, ruby_system): + + if not buildEnv['GPGPU_SIM']: + m5.util.panic("This script requires GPGPU-Sim integration to be built.") + + print "Creating system for GPU" + + # Run the original protocol script + buildEnv['PROTOCOL'] = buildEnv['PROTOCOL'][:-11] + protocol = buildEnv['PROTOCOL'] + exec "import %s" % protocol + try: + (cpu_sequencers, dir_cntrl_nodes, topology) = \ + eval("%s.create_system(options, system, dma_ports, ruby_system)" % protocol) + except: + print "Error: could not create system for ruby protocol inside fusion system %s" % protocol + raise + + # + # Must create the individual controllers before the network to ensure the + # controller constructors are called before the network constructor + # + block_size_bits = int(math.log(options.cacheline_size, 2)) + + cntrl_count = 0 + + for i in xrange(options.num_sc): + # + # First create the Ruby objects associated with this cpu + # + l1i_cache = L1Cache(size = options.l1i_size, + assoc = options.l1i_assoc, + start_index_bit = block_size_bits, + is_icache = True) + l1d_cache = L1Cache(size = options.l1d_size, + assoc = options.l1d_assoc, + start_index_bit = block_size_bits) + l2_cache = L2Cache(size = options.l2_size, + assoc = options.l2_assoc, + start_index_bit = block_size_bits) + + l1_cntrl = L1Cache_Controller(version = options.num_cpus+i, + L1Icache = l1i_cache, + L1Dcache = l1d_cache, + L2cache = l2_cache, + no_mig_atomic = not \ + options.allow_atomic_migration, + send_evictions = ( + options.cpu_type == "detailed"), + transitions_per_cycle = options.ports, + #clk_domain=system.cpu[i].clk_domain, + is_gpu = True, + ruby_system = ruby_system) + + cpu_seq = RubySequencer(version = options.num_cpus + i, + icache = l1i_cache, + dcache = l1d_cache, + access_phys_mem = True, + max_outstanding_requests = options.gpu_l1_buf_depth, + #clk_domain=system.cpu[i].clk_domain, + ruby_system = ruby_system, + connect_to_io = False) + + l1_cntrl.sequencer = cpu_seq + if options.recycle_latency: + l1_cntrl.recycle_latency = options.recycle_latency + + exec("ruby_system.l1_cntrl_sp%02d = l1_cntrl" % i) + + # + # Add controllers and sequencers to the appropriate lists + # + cpu_sequencers.append(cpu_seq) + topology.addController(l1_cntrl) + + # Connect the L1 controller and the network + # Connect the buffers from the controller to network + l1_cntrl.requestFromCache = ruby_system.network.slave + l1_cntrl.responseFromCache = ruby_system.network.slave + l1_cntrl.unblockFromCache = ruby_system.network.slave + + # Connect the buffers from the network to the controller + l1_cntrl.forwardToCache = 
ruby_system.network.master + l1_cntrl.responseToCache = ruby_system.network.master + + cntrl_count += 1 + + ############################################################################ + # Pagewalk cache + # NOTE: We use a CPU L1 cache controller here. This is to facilatate MMU + # cache coherence (as the GPU L1 caches are incoherent without flushes + # The L2 cache is small, and should have minimal affect on the + # performance (see Section 6.2 of Power et al. HPCA 2014). + pwd_cache = L1Cache(size = options.pwc_size, + assoc = 16, # 64 is fully associative @ 8kB + replacement_policy = "LRU", + start_index_bit = block_size_bits, + latency = 8, + resourceStalls = False) + # Small cache since CPU L1 requires I and D + pwi_cache = L1Cache(size = "512B", + assoc = 2, + replacement_policy = "LRU", + start_index_bit = block_size_bits, + latency = 8, + resourceStalls = False) + # Small cache since CPU L1 controller requires L2 + l2_cache = L2Cache(size = "512B", + assoc = 2, + start_index_bit = block_size_bits, + latency = 1, + resourceStalls = False) + + l1_cntrl = L1Cache_Controller(version = options.num_cpus + options.num_sc, + L1Icache = pwi_cache, + L1Dcache = pwd_cache, + L2cache = l2_cache, + send_evictions = False, + cache_response_latency = 1, + l2_cache_hit_latency = 1, + number_of_TBEs = options.gpu_l1_buf_depth, + ruby_system = ruby_system) + + cpu_seq = RubySequencer(version = options.num_cpus + options.num_sc, + icache = pwd_cache, # Never get data from pwi_cache + dcache = pwd_cache, + access_phys_mem = True, + max_outstanding_requests = options.gpu_l1_buf_depth, + ruby_system = ruby_system, + deadlock_threshold = 2000000, + connect_to_io = False) + + l1_cntrl.sequencer = cpu_seq + + + ruby_system.l1_pw_cntrl = l1_cntrl + cpu_sequencers.append(cpu_seq) + + topology.addController(l1_cntrl) + + # Connect the L1 controller and the network + # Connect the buffers from the controller to network + l1_cntrl.requestFromCache = ruby_system.network.slave + l1_cntrl.responseFromCache = ruby_system.network.slave + l1_cntrl.unblockFromCache = ruby_system.network.slave + + # Connect the buffers from the network to the controller + l1_cntrl.forwardToCache = ruby_system.network.master + l1_cntrl.responseToCache = ruby_system.network.master + + # Copy engine cache (make as small as possible, ideally 0) + l1i_cache = L1Cache(size = "2kB", assoc = 2) + l1d_cache = L1Cache(size = "2kB", assoc = 2) + l2_cache = L2Cache(size = "2kB", + assoc = 2, + start_index_bit = block_size_bits) + + l1_cntrl = L1Cache_Controller(version = options.num_cpus+options.num_sc+1, + L1Icache = l1i_cache, + L1Dcache = l1d_cache, + L2cache = l2_cache, + no_mig_atomic = not \ + options.allow_atomic_migration, + send_evictions = ( + options.cpu_type == "detailed"), + ruby_system = ruby_system) + + # + # Only one unified L1 cache exists. Can cache instructions and data. 
+ # + cpu_seq = RubySequencer(version = options.num_cpus + options.num_sc + 1, + icache = l1i_cache, + dcache = l1d_cache, + access_phys_mem = True, + max_outstanding_requests = 64, + ruby_system = ruby_system, + connect_to_io = False) + + l1_cntrl.sequencer = cpu_seq + + ruby_system.l1_cntrl_ce = l1_cntrl + + cpu_sequencers.append(cpu_seq) + topology.addController(l1_cntrl) + + # Connect the L1 controller and the network + # Connect the buffers from the controller to network + l1_cntrl.requestFromCache = ruby_system.network.slave + l1_cntrl.responseFromCache = ruby_system.network.slave + l1_cntrl.unblockFromCache = ruby_system.network.slave + + # Connect the buffers from the network to the controller + l1_cntrl.forwardToCache = ruby_system.network.master + l1_cntrl.responseToCache = ruby_system.network.master + + # BCU + cntrl = BorderControlUnit_Controller(version = 0, + ruby_system = ruby_system) + ruby_system.bcu_cntrl = cntrl + topology.addController(cntrl) + + cntrl.unblockToDirFromCache = ruby_system.network.master + cntrl.responseToDirFromCache = ruby_system.network.master + cntrl.requestToDirFromCache = ruby_system.network.master + + # Connect the buffers from the network to the controller + cntrl.requestFromCacheToDir = ruby_system.network.slave + cntrl.responseFromCacheToDir = ruby_system.network.slave + cntrl.unblockFromCacheToDir = ruby_system.network.slave + + return (cpu_sequencers, dir_cntrl_nodes, topology) diff -r 3ee9d80f490f -r 7b001aa001f0 src/mem/protocol/MOESI_hammer-GPUcache.sm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/mem/protocol/MOESI_hammer-GPUcache.sm Wed Dec 02 17:08:48 2015 -0600 @@ -0,0 +1,2212 @@ +/* + * Copyright (c) 1999-2013 Mark D. Hill and David A. Wood + * Copyright (c) 2009 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * AMD's contributions to the MOESI hammer protocol do not constitute an + * endorsement of its similarity to any AMD products. 
+ * + * Authors: Milo Martin + * Brad Beckmann + */ + +machine({L1Cache, L2Cache}, "AMD Hammer-like protocol") + : Sequencer * sequencer; + CacheMemory * L1Icache; + CacheMemory * L1Dcache; + CacheMemory * L2cache; + Cycles cache_response_latency := 10; + Cycles issue_latency := 2; + Cycles l2_cache_hit_latency := 10; + bool is_gpu := "False"; + bool no_mig_atomic := "True"; + bool send_evictions; + + // NETWORK BUFFERS + MessageBuffer * requestFromCache, network="To", virtual_network="2", + ordered="false", vnet_type="request"; + MessageBuffer * responseFromCache, network="To", virtual_network="4", + ordered="false", vnet_type="response"; + MessageBuffer * unblockFromCache, network="To", virtual_network="5", + ordered="false", vnet_type="unblock"; + + MessageBuffer * forwardToCache, network="From", virtual_network="3", + ordered="false", vnet_type="forward"; + MessageBuffer * responseToCache, network="From", virtual_network="4", + ordered="false", vnet_type="response"; + +{ + + // STATES + state_declaration(State, desc="Cache states", default="L1Cache_State_I") { + // Base states + I, AccessPermission:Invalid, desc="Idle"; + S, AccessPermission:Read_Only, desc="Shared"; + O, AccessPermission:Read_Only, desc="Owned"; + M, AccessPermission:Read_Only, desc="Modified (dirty)"; + MM, AccessPermission:Read_Write, desc="Modified (dirty and locally modified)"; + + // Base states, locked and ready to service the mandatory queue + IR, AccessPermission:Invalid, desc="Idle"; + SR, AccessPermission:Read_Only, desc="Shared"; + OR, AccessPermission:Read_Only, desc="Owned"; + MR, AccessPermission:Read_Only, desc="Modified (dirty)"; + MMR, AccessPermission:Read_Write, desc="Modified (dirty and locally modified)"; + + // Transient States + IM, AccessPermission:Busy, "IM", desc="Issued GetX"; + SM, AccessPermission:Read_Only, "SM", desc="Issued GetX, we still have a valid copy of the line"; + OM, AccessPermission:Read_Only, "OM", desc="Issued GetX, received data"; + ISM, AccessPermission:Read_Only, "ISM", desc="Issued GetX, received valid data, waiting for all acks"; + M_W, AccessPermission:Read_Only, "M^W", desc="Issued GetS, received exclusive data"; + MM_W, AccessPermission:Read_Write, "MM^W", desc="Issued GetX, received exclusive data"; + IS, AccessPermission:Busy, "IS", desc="Issued GetS"; + SS, AccessPermission:Read_Only, "SS", desc="Issued GetS, received data, waiting for all acks"; + OI, AccessPermission:Busy, "OI", desc="Issued PutO, waiting for ack"; + MI, AccessPermission:Busy, "MI", desc="Issued PutX, waiting for ack"; + II, AccessPermission:Busy, "II", desc="Issued PutX/O, saw Other_GETS or Other_GETX, waiting for ack"; + IT, AccessPermission:Busy, "IT", desc="Invalid block transferring to L1"; + ST, AccessPermission:Busy, "ST", desc="S block transferring to L1"; + OT, AccessPermission:Busy, "OT", desc="O block transferring to L1"; + MT, AccessPermission:Busy, "MT", desc="M block transferring to L1"; + MMT, AccessPermission:Busy, "MMT", desc="MM block transferring to L0"; + + //Transition States Related to Flushing + MI_F, AccessPermission:Busy, "MI_F", desc="Issued PutX due to a Flush, waiting for ack"; + MM_F, AccessPermission:Busy, "MM_F", desc="Issued GETF due to a Flush, waiting for ack"; + IM_F, AccessPermission:Busy, "IM_F", desc="Issued GetX due to a Flush"; + ISM_F, AccessPermission:Read_Only, "ISM_F", desc="Issued GetX, received data, waiting for all acks"; + SM_F, AccessPermission:Read_Only, "SM_F", desc="Issued GetX, we still have an old copy of the line"; + OM_F, 
AccessPermission:Read_Only, "OM_F", desc="Issued GetX, received data"; + MM_WF, AccessPermission:Busy, "MM_WF", desc="Issued GetX, received exclusive data"; + } + + // EVENTS + enumeration(Event, desc="Cache events") { + Load, desc="Load request from the processor"; + Ifetch, desc="I-fetch request from the processor"; + Store, desc="Store request from the processor"; + L2_Replacement, desc="L2 Replacement"; + L1_to_L2, desc="L1 to L2 transfer"; + Trigger_L2_to_L1D, desc="Trigger L2 to L1-Data transfer"; + Trigger_L2_to_L1I, desc="Trigger L2 to L1-Instruction transfer"; + Complete_L2_to_L1, desc="L2 to L1 transfer completed"; + + // Requests + Other_GETX, desc="A GetX from another processor"; + Other_GETS, desc="A GetS from another processor"; + Merged_GETS, desc="A Merged GetS from another processor"; + Other_GETS_No_Mig, desc="A GetS from another processor"; + NC_DMA_GETS, desc="special GetS when only DMA exists"; + Invalidate, desc="Invalidate block"; + + // Responses + Ack, desc="Received an ack message"; + Shared_Ack, desc="Received an ack message, responder has a shared copy"; + Data, desc="Received a data message"; + Shared_Data, desc="Received a data message, responder has a shared copy"; + Exclusive_Data, desc="Received a data message, responder had an exclusive copy, they gave it to us"; + + Writeback_Ack, desc="Writeback O.K. from directory"; + Writeback_Nack, desc="Writeback not O.K. from directory"; + + // Triggers + All_acks, desc="Received all required data and message acks"; + All_acks_no_sharers, desc="Received all acks and no other processor has a shared copy"; + + // For Flush + Flush_line, desc="flush the cache line from all caches"; + Block_Ack, desc="the directory is blocked and ready for the flush"; + } + + // TYPES + + // STRUCTURE DEFINITIONS + + MessageBuffer mandatoryQueue, ordered="false"; + + // CacheEntry + structure(Entry, desc="...", interface="AbstractCacheEntry") { + State CacheState, desc="cache state"; + bool Dirty, desc="Is the data dirty (different than memory)?"; + DataBlock DataBlk, desc="data for the block"; + bool FromL2, default="false", desc="block just moved from L2"; + bool AtomicAccessed, default="false", desc="block just moved from L2"; + } + + // TBE fields + structure(TBE, desc="...") { + State TBEState, desc="Transient state"; + DataBlock DataBlk, desc="data for the block, required for concurrent writebacks"; + bool Dirty, desc="Is the data dirty (different than memory)?"; + int NumPendingMsgs, desc="Number of acks/data messages that this processor is waiting for"; + bool Sharers, desc="On a GetS, did we find any other sharers in the system"; + bool AppliedSilentAcks, default="false", desc="for full-bit dir, does the pending msg count reflect the silent acks"; + MachineID LastResponder, desc="last machine to send a response for this request"; + MachineID CurOwner, desc="current owner of the block, used for UnblockS responses"; + + Cycles InitialRequestTime, default="Cycles(0)", + desc="time the initial requests was sent from the L1Cache"; + Cycles ForwardRequestTime, default="Cycles(0)", + desc="time the dir forwarded the request"; + Cycles FirstResponseTime, default="Cycles(0)", + desc="the time the first response was received"; + } + + structure(TBETable, external="yes") { + TBE lookup(Address); + void allocate(Address); + void deallocate(Address); + bool isPresent(Address); + } + + TBETable TBEs, template="", constructor="m_number_of_TBEs"; + + void set_cache_entry(AbstractCacheEntry b); + void unset_cache_entry(); + void set_tbe(TBE b); 
+
+  AccessPermission getAccessPermission(Address addr) {
+    TBE tbe := TBEs[addr];
+    if (is_valid(tbe)) {
+      return L1Cache_State_to_permission(tbe.TBEState);
+    }
+
+    Entry cache_entry := getCacheEntry(addr);
+    if (is_valid(cache_entry)) {
+      return L1Cache_State_to_permission(cache_entry.CacheState);
+    }
+
+    return AccessPermission:NotPresent;
+  }
+
+  void setAccessPermission(Entry cache_entry, Address addr, State state) {
+    if (is_valid(cache_entry)) {
+      cache_entry.changePermission(L1Cache_State_to_permission(state));
+    }
+  }
+
+  Event mandatory_request_type_to_event(RubyRequestType type) {
+    if (type == RubyRequestType:LD) {
+      return Event:Load;
+    } else if (type == RubyRequestType:IFETCH) {
+      return Event:Ifetch;
+    } else if ((type == RubyRequestType:ST) || (type == RubyRequestType:ATOMIC)) {
+      return Event:Store;
+    } else if (type == RubyRequestType:FLUSH) {
+      return Event:Flush_line;
+    } else {
+      error("Invalid RubyRequestType");
+    }
+  }
+
+  MachineType testAndClearLocalHit(Entry cache_entry) {
+    if (is_valid(cache_entry) && cache_entry.FromL2) {
+      cache_entry.FromL2 := false;
+      return MachineType:L2Cache;
+    }
+    return MachineType:L1Cache;
+  }
+
+  bool IsAtomicAccessed(Entry cache_entry) {
+    assert(is_valid(cache_entry));
+    return cache_entry.AtomicAccessed;
+  }
+
+  MessageBuffer triggerQueue, ordered="false";
+
+  // ** OUT_PORTS **
+
+  out_port(requestNetwork_out, RequestMsg, requestFromCache);
+  out_port(responseNetwork_out, ResponseMsg, responseFromCache);
+  out_port(unblockNetwork_out, ResponseMsg, unblockFromCache);
+  out_port(triggerQueue_out, TriggerMsg, triggerQueue);
+
+  // ** IN_PORTS **
+
+  // Trigger Queue
+  in_port(triggerQueue_in, TriggerMsg, triggerQueue, rank=3) {
+    if (triggerQueue_in.isReady()) {
+      peek(triggerQueue_in, TriggerMsg) {
+
+        Entry cache_entry := getCacheEntry(in_msg.Addr);
+        TBE tbe := TBEs[in_msg.Addr];
+
+        if (in_msg.Type == TriggerType:L2_to_L1) {
+          trigger(Event:Complete_L2_to_L1, in_msg.Addr, cache_entry, tbe);
+        } else if (in_msg.Type == TriggerType:ALL_ACKS) {
+          trigger(Event:All_acks, in_msg.Addr, cache_entry, tbe);
+        } else if (in_msg.Type == TriggerType:ALL_ACKS_NO_SHARERS) {
+          trigger(Event:All_acks_no_sharers, in_msg.Addr, cache_entry, tbe);
+        } else {
+          error("Unexpected message");
+        }
+      }
+    }
+  }
+
+  // Nothing from the unblock network
+
+  // Response Network
+  in_port(responseToCache_in, ResponseMsg, responseToCache, rank=2) {
+    if (responseToCache_in.isReady()) {
+      peek(responseToCache_in, ResponseMsg, block_on="Addr") {
+
+        Entry cache_entry := getCacheEntry(in_msg.Addr);
+        TBE tbe := TBEs[in_msg.Addr];
+
+        if (in_msg.Type == CoherenceResponseType:ACK) {
+          trigger(Event:Ack, in_msg.Addr, cache_entry, tbe);
+        } else if (in_msg.Type == CoherenceResponseType:ACK_SHARED) {
+          trigger(Event:Shared_Ack, in_msg.Addr, cache_entry, tbe);
+        } else if (in_msg.Type == CoherenceResponseType:DATA) {
+          trigger(Event:Data, in_msg.Addr, cache_entry, tbe);
+        } else if (in_msg.Type == CoherenceResponseType:DATA_SHARED) {
+          trigger(Event:Shared_Data, in_msg.Addr, cache_entry, tbe);
+        } else if (in_msg.Type == CoherenceResponseType:DATA_EXCLUSIVE) {
+          trigger(Event:Exclusive_Data, in_msg.Addr, cache_entry, tbe);
+        } else {
+          error("Unexpected message");
+        }
+      }
+    }
+  }
+
+  // Forward Network
+  in_port(forwardToCache_in, RequestMsg, forwardToCache, rank=1) {
+    if (forwardToCache_in.isReady()) {
+      peek(forwardToCache_in, RequestMsg, block_on="Addr") {
+
+        Entry cache_entry := getCacheEntry(in_msg.Addr);
+        TBE tbe := TBEs[in_msg.Addr];
+
+        if ((in_msg.Type == CoherenceRequestType:GETX) ||
+            (in_msg.Type == CoherenceRequestType:GETF)) {
+          trigger(Event:Other_GETX, in_msg.Addr, cache_entry, tbe);
+        } else if (in_msg.Type == CoherenceRequestType:MERGED_GETS) {
+          trigger(Event:Merged_GETS, in_msg.Addr, cache_entry, tbe);
+        } else if (in_msg.Type == CoherenceRequestType:GETS) {
+          if (machineCount(MachineType:L1Cache) > 1) {
+            if (is_valid(cache_entry)) {
+              if (IsAtomicAccessed(cache_entry) && no_mig_atomic) {
+                trigger(Event:Other_GETS_No_Mig, in_msg.Addr, cache_entry, tbe);
+              } else {
+                trigger(Event:Other_GETS, in_msg.Addr, cache_entry, tbe);
+              }
+            } else {
+              trigger(Event:Other_GETS, in_msg.Addr, cache_entry, tbe);
+            }
+          } else {
+            trigger(Event:NC_DMA_GETS, in_msg.Addr, cache_entry, tbe);
+          }
+        } else if (in_msg.Type == CoherenceRequestType:INV) {
+          trigger(Event:Invalidate, in_msg.Addr, cache_entry, tbe);
+        } else if (in_msg.Type == CoherenceRequestType:WB_ACK) {
+          trigger(Event:Writeback_Ack, in_msg.Addr, cache_entry, tbe);
+        } else if (in_msg.Type == CoherenceRequestType:WB_NACK) {
+          trigger(Event:Writeback_Nack, in_msg.Addr, cache_entry, tbe);
+        } else if (in_msg.Type == CoherenceRequestType:BLOCK_ACK) {
+          trigger(Event:Block_Ack, in_msg.Addr, cache_entry, tbe);
+        } else {
+          error("Unexpected message");
+        }
+      }
+    }
+  }
+
+  // Nothing from the request network
+
+  // Mandatory Queue
+  in_port(mandatoryQueue_in, RubyRequest, mandatoryQueue, desc="...", rank=0) {
+    if (mandatoryQueue_in.isReady()) {
+      peek(mandatoryQueue_in, RubyRequest, block_on="LineAddress") {
+
+        // Check for data access to blocks in I-cache and ifetchs to blocks in D-cache
+        TBE tbe := TBEs[in_msg.LineAddress];
+
+        if (in_msg.Type == RubyRequestType:IFETCH) {
+          // ** INSTRUCTION ACCESS ***
+
+          Entry L1Icache_entry := getL1ICacheEntry(in_msg.LineAddress);
+          if (is_valid(L1Icache_entry)) {
+            // The tag matches for the L1, so the L1 fetches the line.
+            // We know it can't be in the L2 due to exclusion.
+            trigger(mandatory_request_type_to_event(in_msg.Type),
+                    in_msg.LineAddress, L1Icache_entry, tbe);
+          } else {
+            // Check to see if it is in the OTHER L1
+            Entry L1Dcache_entry := getL1DCacheEntry(in_msg.LineAddress);
+            if (is_valid(L1Dcache_entry)) {
+              // The block is in the wrong L1, try to write it to the L2
+              if (L2cache.cacheAvail(in_msg.LineAddress)) {
+                trigger(Event:L1_to_L2, in_msg.LineAddress, L1Dcache_entry, tbe);
+              } else {
+                Address l2_victim_addr := L2cache.cacheProbe(in_msg.LineAddress);
+                trigger(Event:L2_Replacement,
+                        l2_victim_addr,
+                        getL2CacheEntry(l2_victim_addr),
+                        TBEs[l2_victim_addr]);
+              }
+            }
+
+            if (L1Icache.cacheAvail(in_msg.LineAddress)) {
+              // L1 doesn't have the line, but we have space for it in the L1
+              Entry L2cache_entry := getL2CacheEntry(in_msg.LineAddress);
+              if (is_valid(L2cache_entry)) {
+                // L2 has it (maybe not with the right permissions)
+                trigger(Event:Trigger_L2_to_L1I, in_msg.LineAddress,
+                        L2cache_entry, tbe);
+              } else {
+                // We have room, the L2 doesn't have it, so the L1 fetches the line
+                trigger(mandatory_request_type_to_event(in_msg.Type),
+                        in_msg.LineAddress, L1Icache_entry, tbe);
+              }
+            } else {
+              // No room in the L1, so we need to make room
+              Address l1i_victim_addr := L1Icache.cacheProbe(in_msg.LineAddress);
+              if (L2cache.cacheAvail(l1i_victim_addr)) {
+                // The L2 has room, so we move the line from the L1 to the L2
+                trigger(Event:L1_to_L2,
+                        l1i_victim_addr,
+                        getL1ICacheEntry(l1i_victim_addr),
+                        TBEs[l1i_victim_addr]);
+              } else {
+                Address l2_victim_addr := L2cache.cacheProbe(l1i_victim_addr);
+                // The L2 does not have room, so we replace a line from the L2
+                trigger(Event:L2_Replacement,
+                        l2_victim_addr,
+                        getL2CacheEntry(l2_victim_addr),
+                        TBEs[l2_victim_addr]);
+              }
+            }
+          }
+        } else {
+          // *** DATA ACCESS ***
+
+          Entry L1Dcache_entry := getL1DCacheEntry(in_msg.LineAddress);
+          if (is_valid(L1Dcache_entry)) {
+            // The tag matches for the L1, so the L1 fetches the line.
+            // We know it can't be in the L2 due to exclusion.
+            trigger(mandatory_request_type_to_event(in_msg.Type),
+                    in_msg.LineAddress, L1Dcache_entry, tbe);
+          } else {
+
+            // Check to see if it is in the OTHER L1
+            Entry L1Icache_entry := getL1ICacheEntry(in_msg.LineAddress);
+            if (is_valid(L1Icache_entry)) {
+              // The block is in the wrong L1, try to write it to the L2
+              if (L2cache.cacheAvail(in_msg.LineAddress)) {
+                trigger(Event:L1_to_L2, in_msg.LineAddress, L1Icache_entry, tbe);
+              } else {
+                Address l2_victim_addr := L2cache.cacheProbe(in_msg.LineAddress);
+                trigger(Event:L2_Replacement,
+                        l2_victim_addr,
+                        getL2CacheEntry(l2_victim_addr),
+                        TBEs[l2_victim_addr]);
+              }
+            }
+
+            if (L1Dcache.cacheAvail(in_msg.LineAddress)) {
+              // L1 doesn't have the line, but we have space for it in the L1
+              Entry L2cache_entry := getL2CacheEntry(in_msg.LineAddress);
+              if (is_valid(L2cache_entry)) {
+                // L2 has it (maybe not with the right permissions)
+                trigger(Event:Trigger_L2_to_L1D, in_msg.LineAddress,
+                        L2cache_entry, tbe);
+              } else {
+                // We have room, the L2 doesn't have it, so the L1 fetches the line
+                trigger(mandatory_request_type_to_event(in_msg.Type),
+                        in_msg.LineAddress, L1Dcache_entry, tbe);
+              }
+            } else {
+              // No room in the L1, so we need to make room
+              Address l1d_victim_addr := L1Dcache.cacheProbe(in_msg.LineAddress);
+              if (L2cache.cacheAvail(l1d_victim_addr)) {
+                // The L2 has room, so we move the line from the L1 to the L2
+                trigger(Event:L1_to_L2,
+                        l1d_victim_addr,
+                        getL1DCacheEntry(l1d_victim_addr),
+                        TBEs[l1d_victim_addr]);
+              } else {
+                Address l2_victim_addr := L2cache.cacheProbe(l1d_victim_addr);
+                // The L2 does not have room, so we replace a line from the L2
+                trigger(Event:L2_Replacement,
+                        l2_victim_addr,
+                        getL2CacheEntry(l2_victim_addr),
+                        TBEs[l2_victim_addr]);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
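+
+  // NOTE: the decision tree above implements the exclusive hierarchy:
+  // 1) hit in the matching L1 -> service directly (exclusion: not in L2);
+  // 2) hit in the other L1 -> migrate it L1 -> L2 first (L1_to_L2), after
+  //    freeing an L2 victim if needed (L2_Replacement);
+  // 3) miss in L1 but hit in L2 -> Trigger_L2_to_L1{I,D} moves it up;
+  // 4) miss everywhere -> fall through to a GETS/GETX request, with L1/L2
+  //    victims evicted first so the fill always has a slot.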
+
+  // ACTIONS
+
+  action(a_issueGETS, "a", desc="Issue GETS") {
+    enqueue(requestNetwork_out, RequestMsg, issue_latency) {
+      assert(is_valid(tbe));
+      out_msg.Addr := address;
+      out_msg.Type := CoherenceRequestType:GETS;
+      out_msg.Requestor := machineID;
+      if (is_gpu) {
+        DPRINTF(RubySlicc, "Setting up the broadcast\n");
+        out_msg.Destination.broadcast(MachineType:BorderControlUnit);
+      } else {
+        out_msg.Destination.add(map_Address_to_Directory(address));
+      }
+      out_msg.OriginalDestination.add(map_Address_to_Directory(address));
+      out_msg.MessageSize := MessageSizeType:Request_Control;
+      out_msg.InitialRequestTime := curCycle();
+
+      // One from each other cache (n-1) plus the memory (+1)
+      tbe.NumPendingMsgs := machineCount(MachineType:L1Cache);
+    }
+  }
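+
+  // NOTE: with is_gpu set the request is broadcast to every
+  // BorderControlUnit, which relays it to the home directory recorded in
+  // OriginalDestination; otherwise it goes to the directory directly.
+  // Worked example of the pending-message count (hypothetical n = 4 L1s,
+  // one of which owns the block): NumPendingMsgs starts at 4; the two
+  // non-owner peers ack with Acks = 1 each and the owner's data response
+  // carries Acks = 2 (itself plus memory), so 1 + 1 + 2 = 4 and the
+  // completion trigger fires.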
+
+  action(b_issueGETX, "b", desc="Issue GETX") {
+    enqueue(requestNetwork_out, RequestMsg, issue_latency) {
+      assert(is_valid(tbe));
+      out_msg.Addr := address;
+      out_msg.Type := CoherenceRequestType:GETX;
+      out_msg.Requestor := machineID;
+      if (is_gpu) {
+        DPRINTF(RubySlicc, "Setting up the broadcast\n");
+        out_msg.Destination.broadcast(MachineType:BorderControlUnit);
+      } else {
+        out_msg.Destination.add(map_Address_to_Directory(address));
+      }
+      out_msg.OriginalDestination.add(map_Address_to_Directory(address));
+      out_msg.MessageSize := MessageSizeType:Request_Control;
+      out_msg.InitialRequestTime := curCycle();
+
+      // One from each other cache (n-1) plus the memory (+1)
+      tbe.NumPendingMsgs := machineCount(MachineType:L1Cache);
+    }
+  }
+
+  action(b_issueGETXIfMoreThanOne, "bo", desc="Issue GETX") {
+    if (machineCount(MachineType:L1Cache) > 1) {
+      enqueue(requestNetwork_out, RequestMsg, issue_latency) {
+        assert(is_valid(tbe));
+        out_msg.Addr := address;
+        out_msg.Type := CoherenceRequestType:GETX;
+        out_msg.Requestor := machineID;
+        if (is_gpu) {
+          DPRINTF(RubySlicc, "Setting up the broadcast\n");
+          out_msg.Destination.broadcast(MachineType:BorderControlUnit);
+        } else {
+          out_msg.Destination.add(map_Address_to_Directory(address));
+        }
+        out_msg.OriginalDestination.add(map_Address_to_Directory(address));
+        out_msg.MessageSize := MessageSizeType:Request_Control;
+        out_msg.InitialRequestTime := curCycle();
+      }
+    }
+
+    // One from each other cache (n-1) plus the memory (+1)
+    tbe.NumPendingMsgs := machineCount(MachineType:L1Cache);
+  }
+
+  action(bf_issueGETF, "bf", desc="Issue GETF") {
+    enqueue(requestNetwork_out, RequestMsg, issue_latency) {
+      assert(is_valid(tbe));
+      out_msg.Addr := address;
+      out_msg.Type := CoherenceRequestType:GETF;
+      out_msg.Requestor := machineID;
+      if (is_gpu) {
+        DPRINTF(RubySlicc, "Setting up the broadcast\n");
+        out_msg.Destination.broadcast(MachineType:BorderControlUnit);
+      } else {
+        out_msg.Destination.add(map_Address_to_Directory(address));
+      }
+      out_msg.OriginalDestination.add(map_Address_to_Directory(address));
+      out_msg.MessageSize := MessageSizeType:Request_Control;
+      out_msg.InitialRequestTime := curCycle();
+
+      // One from each other cache (n-1) plus the memory (+1)
+      tbe.NumPendingMsgs := machineCount(MachineType:L1Cache);
+    }
+  }
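+
+  // NOTE: GETF opens the flush handshake: the directory blocks the line and
+  // answers with Block_Ack, after which this cache issues PUTF
+  // (df_issuePUTF) and writes the data back on Writeback_Ack (see the
+  // MM_F/MI_F transitions below).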
+
+  action(c_sendExclusiveData, "c", desc="Send exclusive data from cache to requestor") {
+    peek(forwardToCache_in, RequestMsg) {
+      enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) {
+        assert(is_valid(cache_entry));
+        out_msg.Addr := address;
+        out_msg.Type := CoherenceResponseType:DATA_EXCLUSIVE;
+        out_msg.Sender := machineID;
+        out_msg.Destination.add(in_msg.Requestor);
+        out_msg.OriginalDestination.add(in_msg.Requestor);
+        out_msg.DataBlk := cache_entry.DataBlk;
+        out_msg.Dirty := cache_entry.Dirty;
+        if (in_msg.DirectedProbe) {
+          out_msg.Acks := machineCount(MachineType:L1Cache);
+        } else {
+          out_msg.Acks := 2;
+        }
+        out_msg.SilentAcks := in_msg.SilentAcks;
+        out_msg.MessageSize := MessageSizeType:Response_Data;
+        out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+        out_msg.ForwardRequestTime := in_msg.ForwardRequestTime;
+      }
+    }
+  }
+
+  action(ct_sendExclusiveDataFromTBE, "ct", desc="Send exclusive data from tbe to requestor") {
+    peek(forwardToCache_in, RequestMsg) {
+      enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) {
+        assert(is_valid(tbe));
+        out_msg.Addr := address;
+        out_msg.Type := CoherenceResponseType:DATA_EXCLUSIVE;
+        out_msg.Sender := machineID;
+        out_msg.Destination.add(in_msg.Requestor);
+        out_msg.OriginalDestination.add(in_msg.Requestor);
+        out_msg.DataBlk := tbe.DataBlk;
+        out_msg.Dirty := tbe.Dirty;
+        if (in_msg.DirectedProbe) {
+          out_msg.Acks := machineCount(MachineType:L1Cache);
+        } else {
+          out_msg.Acks := 2;
+        }
+        out_msg.SilentAcks := in_msg.SilentAcks;
+        out_msg.MessageSize := MessageSizeType:Response_Data;
+        out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+        out_msg.ForwardRequestTime := in_msg.ForwardRequestTime;
+      }
+    }
+  }
+
+  action(d_issuePUT, "d", desc="Issue PUT") {
+    enqueue(requestNetwork_out, RequestMsg, issue_latency) {
+      out_msg.Addr := address;
+      out_msg.Type := CoherenceRequestType:PUT;
+      out_msg.Requestor := machineID;
+      if (is_gpu) {
+        DPRINTF(RubySlicc, "Setting up the broadcast\n");
+        out_msg.Destination.broadcast(MachineType:BorderControlUnit);
+      } else {
+        out_msg.Destination.add(map_Address_to_Directory(address));
+      }
+      out_msg.OriginalDestination.add(map_Address_to_Directory(address));
+      out_msg.MessageSize := MessageSizeType:Writeback_Control;
+    }
+  }
+
+  action(df_issuePUTF, "df", desc="Issue PUTF") {
+    enqueue(requestNetwork_out, RequestMsg, issue_latency) {
+      out_msg.Addr := address;
+      out_msg.Type := CoherenceRequestType:PUTF;
+      out_msg.Requestor := machineID;
+      if (is_gpu) {
+        DPRINTF(RubySlicc, "Setting up the broadcast\n");
+        out_msg.Destination.broadcast(MachineType:BorderControlUnit);
+      } else {
+        out_msg.Destination.add(map_Address_to_Directory(address));
+      }
+      out_msg.OriginalDestination.add(map_Address_to_Directory(address));
+      out_msg.MessageSize := MessageSizeType:Writeback_Control;
+    }
+  }
+
+  action(e_sendData, "e", desc="Send data from cache to requestor") {
+    peek(forwardToCache_in, RequestMsg) {
+      enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) {
+        assert(is_valid(cache_entry));
+        out_msg.Addr := address;
+        out_msg.Type := CoherenceResponseType:DATA;
+        out_msg.Sender := machineID;
+        out_msg.Destination.add(in_msg.Requestor);
+        out_msg.OriginalDestination.add(in_msg.Requestor);
+        out_msg.DataBlk := cache_entry.DataBlk;
+        out_msg.Dirty := cache_entry.Dirty;
+        if (in_msg.DirectedProbe) {
+          out_msg.Acks := machineCount(MachineType:L1Cache);
+        } else {
+          out_msg.Acks := 2;
+        }
+        out_msg.SilentAcks := in_msg.SilentAcks;
+        out_msg.MessageSize := MessageSizeType:Response_Data;
+        out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+        out_msg.ForwardRequestTime := in_msg.ForwardRequestTime;
+      }
+    }
+  }
+
+  action(ee_sendDataShared, "\e", desc="Send data from cache to requestor, while remaining the owner") {
+    peek(forwardToCache_in, RequestMsg) {
+      enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) {
+        assert(is_valid(cache_entry));
+        out_msg.Addr := address;
+        out_msg.Type := CoherenceResponseType:DATA_SHARED;
+        out_msg.Sender := machineID;
+        out_msg.Destination.add(in_msg.Requestor);
+        out_msg.OriginalDestination.add(in_msg.Requestor);
+        out_msg.DataBlk := cache_entry.DataBlk;
+        out_msg.Dirty := cache_entry.Dirty;
+        DPRINTF(RubySlicc, "%s\n", out_msg.DataBlk);
+        if (in_msg.DirectedProbe) {
+          out_msg.Acks := machineCount(MachineType:L1Cache);
+        } else {
+          out_msg.Acks := 2;
+        }
+        out_msg.SilentAcks := in_msg.SilentAcks;
+        out_msg.MessageSize := MessageSizeType:Response_Data;
+        out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+        out_msg.ForwardRequestTime := in_msg.ForwardRequestTime;
+      }
+    }
+  }
+
+  action(et_sendDataSharedFromTBE, "\et", desc="Send data from TBE to requestor, keep a shared copy") {
+    peek(forwardToCache_in, RequestMsg) {
+      enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) {
+        assert(is_valid(tbe));
+        out_msg.Addr := address;
+        out_msg.Type := CoherenceResponseType:DATA_SHARED;
+        out_msg.Sender := machineID;
+        out_msg.Destination.add(in_msg.Requestor);
+        out_msg.OriginalDestination.add(in_msg.Requestor);
+        out_msg.DataBlk := tbe.DataBlk;
+        out_msg.Dirty := tbe.Dirty;
+        DPRINTF(RubySlicc, "%s\n", out_msg.DataBlk);
+        if (in_msg.DirectedProbe) {
+          out_msg.Acks := machineCount(MachineType:L1Cache);
+        } else {
+          out_msg.Acks := 2;
+        }
+        out_msg.SilentAcks := in_msg.SilentAcks;
+        out_msg.MessageSize := MessageSizeType:Response_Data;
+        out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+        out_msg.ForwardRequestTime := in_msg.ForwardRequestTime;
+      }
+    }
+  }
+
+  action(em_sendDataSharedMultiple, "em", desc="Send data from cache to all requestors, still the owner") {
+    peek(forwardToCache_in, RequestMsg) {
+      enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) {
+        assert(is_valid(cache_entry));
+        out_msg.Addr := address;
+        out_msg.Type := CoherenceResponseType:DATA_SHARED;
+        out_msg.Sender := machineID;
+        out_msg.Destination := in_msg.MergedRequestors;
+        out_msg.OriginalDestination := in_msg.MergedRequestors;
+        out_msg.DataBlk := cache_entry.DataBlk;
+        out_msg.Dirty := cache_entry.Dirty;
+        DPRINTF(RubySlicc, "%s\n", out_msg.DataBlk);
+        out_msg.Acks := machineCount(MachineType:L1Cache);
+        out_msg.SilentAcks := in_msg.SilentAcks;
+        out_msg.MessageSize := MessageSizeType:Response_Data;
+        out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+        out_msg.ForwardRequestTime := in_msg.ForwardRequestTime;
+      }
+    }
+  }
+
+  action(emt_sendDataSharedMultipleFromTBE, "emt", desc="Send data from tbe to all requestors") {
+    peek(forwardToCache_in, RequestMsg) {
+      enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) {
+        assert(is_valid(tbe));
+        out_msg.Addr := address;
+        out_msg.Type := CoherenceResponseType:DATA_SHARED;
+        out_msg.Sender := machineID;
+        out_msg.Destination := in_msg.MergedRequestors;
+        out_msg.OriginalDestination := in_msg.MergedRequestors;
+        out_msg.DataBlk := tbe.DataBlk;
+        out_msg.Dirty := tbe.Dirty;
+        DPRINTF(RubySlicc, "%s\n", out_msg.DataBlk);
+        out_msg.Acks := machineCount(MachineType:L1Cache);
+        out_msg.SilentAcks := in_msg.SilentAcks;
+        out_msg.MessageSize := MessageSizeType:Response_Data;
+        out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+        out_msg.ForwardRequestTime := in_msg.ForwardRequestTime;
+      }
+    }
+  }
+
+  action(f_sendAck, "f", desc="Send ack from cache to requestor") {
+    peek(forwardToCache_in, RequestMsg) {
+      enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) {
+        out_msg.Addr := address;
+        out_msg.Type := CoherenceResponseType:ACK;
+        out_msg.Sender := machineID;
+        out_msg.Destination.add(in_msg.Requestor);
+        out_msg.OriginalDestination.add(in_msg.Requestor);
+        out_msg.Acks := 1;
+        out_msg.SilentAcks := in_msg.SilentAcks;
+        assert(in_msg.DirectedProbe == false);
+        out_msg.MessageSize := MessageSizeType:Response_Control;
+        out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+        out_msg.ForwardRequestTime := in_msg.ForwardRequestTime;
+      }
+    }
+  }
+
+  action(ff_sendAckShared, "\f", desc="Send shared ack from cache to requestor") {
+    peek(forwardToCache_in, RequestMsg) {
+      enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) {
+        out_msg.Addr := address;
+        out_msg.Type := CoherenceResponseType:ACK_SHARED;
+        out_msg.Sender := machineID;
+        out_msg.Destination.add(in_msg.Requestor);
+        out_msg.OriginalDestination.add(in_msg.Requestor);
+        out_msg.Acks := 1;
+        out_msg.SilentAcks := in_msg.SilentAcks;
+        assert(in_msg.DirectedProbe == false);
+        out_msg.MessageSize := MessageSizeType:Response_Control;
+        out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+        out_msg.ForwardRequestTime := in_msg.ForwardRequestTime;
+      }
+    }
+  }
+
+  action(g_sendUnblock, "g", desc="Send unblock to memory") {
+    enqueue(unblockNetwork_out, ResponseMsg, cache_response_latency) {
+      out_msg.Addr := address;
+      out_msg.Type := CoherenceResponseType:UNBLOCK;
+      out_msg.Sender := machineID;
+      if (is_gpu) {
+        out_msg.Destination.broadcast(MachineType:BorderControlUnit);
+      } else {
+        out_msg.Destination.add(map_Address_to_Directory(address));
+      }
+      out_msg.OriginalDestination.add(map_Address_to_Directory(address));
+      out_msg.MessageSize := MessageSizeType:Unblock_Control;
+    }
+  }
+
+  action(gm_sendUnblockM, "gm", desc="Send unblock to memory and indicate M/O/E state") {
+    enqueue(unblockNetwork_out, ResponseMsg, cache_response_latency) {
+      out_msg.Addr := address;
+      out_msg.Type := CoherenceResponseType:UNBLOCKM;
+      out_msg.Sender := machineID;
+      if (is_gpu) {
+        out_msg.Destination.broadcast(MachineType:BorderControlUnit);
+      } else {
+        out_msg.Destination.add(map_Address_to_Directory(address));
+      }
+      out_msg.OriginalDestination.add(map_Address_to_Directory(address));
+      out_msg.MessageSize := MessageSizeType:Unblock_Control;
+    }
+  }
+
+  action(gs_sendUnblockS, "gs", desc="Send unblock to memory and indicate S state") {
+    enqueue(unblockNetwork_out, ResponseMsg, cache_response_latency) {
+      assert(is_valid(tbe));
+      out_msg.Addr := address;
+      out_msg.Type := CoherenceResponseType:UNBLOCKS;
+      out_msg.Sender := machineID;
+      out_msg.CurOwner := tbe.CurOwner;
+      if (is_gpu) {
+        out_msg.Destination.broadcast(MachineType:BorderControlUnit);
+      } else {
+        out_msg.Destination.add(map_Address_to_Directory(address));
+      }
+      out_msg.OriginalDestination.add(map_Address_to_Directory(address));
+      out_msg.MessageSize := MessageSizeType:Unblock_Control;
+    }
+  }
+
+  action(h_load_hit, "h", desc="Notify sequencer the load completed.") {
+    assert(is_valid(cache_entry));
+    DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk);
+    sequencer.readCallback(address, cache_entry.DataBlk, false,
+                           testAndClearLocalHit(cache_entry));
+  }
+
+  action(hx_external_load_hit, "hx", desc="load required external msgs") {
+    assert(is_valid(cache_entry));
+    assert(is_valid(tbe));
+    DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk);
+    peek(responseToCache_in, ResponseMsg) {
+
+      sequencer.readCallback(address, cache_entry.DataBlk, true,
+          machineIDToMachineType(in_msg.Sender), tbe.InitialRequestTime,
+          tbe.ForwardRequestTime, tbe.FirstResponseTime);
+    }
+  }
+
+  action(hh_store_hit, "\h", desc="Notify sequencer that store completed.") {
+    assert(is_valid(cache_entry));
+    DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk);
+    peek(mandatoryQueue_in, RubyRequest) {
+      sequencer.writeCallback(address, cache_entry.DataBlk, false,
+                              testAndClearLocalHit(cache_entry));
+
+      cache_entry.Dirty := true;
+      if (in_msg.Type == RubyRequestType:ATOMIC) {
+        cache_entry.AtomicAccessed := true;
+      }
+    }
+  }
+
+  action(hh_flush_hit, "\hf", desc="Notify sequencer that flush completed.") {
+    assert(is_valid(tbe));
+    DPRINTF(RubySlicc, "%s\n", tbe.DataBlk);
+    sequencer.writeCallback(address, tbe.DataBlk, false, MachineType:L1Cache);
+  }
+
+  action(sx_external_store_hit, "sx", desc="store required external msgs.") {
+    assert(is_valid(cache_entry));
+    assert(is_valid(tbe));
+    DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk);
+    peek(responseToCache_in, ResponseMsg) {
+
+      sequencer.writeCallback(address, cache_entry.DataBlk, true,
+          machineIDToMachineType(in_msg.Sender), tbe.InitialRequestTime,
+          tbe.ForwardRequestTime, tbe.FirstResponseTime);
+    }
+    DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk);
+    cache_entry.Dirty := true;
+  }
+
+  action(sxt_trig_ext_store_hit, "sxt", desc="store required external msgs.") {
+    assert(is_valid(cache_entry));
+    assert(is_valid(tbe));
+    DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk);
+
+    sequencer.writeCallback(address, cache_entry.DataBlk, true,
+        machineIDToMachineType(tbe.LastResponder), tbe.InitialRequestTime,
+        tbe.ForwardRequestTime, tbe.FirstResponseTime);
+
+    cache_entry.Dirty := true;
+  }
+
+  action(i_allocateTBE, "i", desc="Allocate TBE") {
+    check_allocate(TBEs);
+    assert(is_valid(cache_entry));
+    TBEs.allocate(address);
+    set_tbe(TBEs[address]);
+    tbe.DataBlk := cache_entry.DataBlk; // Data only used for writebacks
+    tbe.Dirty := cache_entry.Dirty;
+    tbe.Sharers := false;
+  }
+
+  action(it_allocateTBE, "it", desc="Allocate TBE") {
+    check_allocate(TBEs);
+    TBEs.allocate(address);
+    set_tbe(TBEs[address]);
+    tbe.Dirty := false;
+    tbe.Sharers := false;
+  }
+
+  action(j_popTriggerQueue, "j", desc="Pop trigger queue.") {
+    triggerQueue_in.dequeue();
+  }
+
+  action(k_popMandatoryQueue, "k", desc="Pop mandatory queue.") {
+    mandatoryQueue_in.dequeue();
+  }
+
+  action(l_popForwardQueue, "l", desc="Pop forwarded request queue.") {
+    forwardToCache_in.dequeue();
+  }
+
+  action(hp_copyFromTBEToL2, "li", desc="Copy data from TBE to L2 cache entry.") {
+    assert(is_valid(cache_entry));
+    assert(is_valid(tbe));
+    cache_entry.Dirty := tbe.Dirty;
+    cache_entry.DataBlk := tbe.DataBlk;
+  }
+
+  action(nb_copyFromTBEToL1, "fu", desc="Copy data from TBE to L1 cache entry.") {
+    assert(is_valid(cache_entry));
+    assert(is_valid(tbe));
+    cache_entry.Dirty := tbe.Dirty;
+    cache_entry.DataBlk := tbe.DataBlk;
+    cache_entry.FromL2 := true;
+  }
+
+  action(m_decrementNumberOfMessages, "m", desc="Decrement the number of messages for which we're waiting") {
+    peek(responseToCache_in, ResponseMsg) {
+      assert(in_msg.Acks >= 0);
+      assert(is_valid(tbe));
+      DPRINTF(RubySlicc, "Sender = %s\n", in_msg.Sender);
+      DPRINTF(RubySlicc, "SilentAcks = %d\n", in_msg.SilentAcks);
+      if (tbe.AppliedSilentAcks == false) {
+        tbe.NumPendingMsgs := tbe.NumPendingMsgs - in_msg.SilentAcks;
+        tbe.AppliedSilentAcks := true;
+      }
+      DPRINTF(RubySlicc, "%d\n", tbe.NumPendingMsgs);
+      tbe.NumPendingMsgs := tbe.NumPendingMsgs - in_msg.Acks;
+      DPRINTF(RubySlicc, "%d\n", tbe.NumPendingMsgs);
+      APPEND_TRANSITION_COMMENT(tbe.NumPendingMsgs);
+      APPEND_TRANSITION_COMMENT(in_msg.Sender);
+      tbe.LastResponder := in_msg.Sender;
+      if (tbe.InitialRequestTime != zero_time() && in_msg.InitialRequestTime != zero_time()) {
+        assert(tbe.InitialRequestTime == in_msg.InitialRequestTime);
+      }
+      if (in_msg.InitialRequestTime != zero_time()) {
+        tbe.InitialRequestTime := in_msg.InitialRequestTime;
+      }
+      if (tbe.ForwardRequestTime != zero_time() && in_msg.ForwardRequestTime != zero_time()) {
+        assert(tbe.ForwardRequestTime == in_msg.ForwardRequestTime);
+      }
+      if (in_msg.ForwardRequestTime != zero_time()) {
+        tbe.ForwardRequestTime := in_msg.ForwardRequestTime;
+      }
+      if (tbe.FirstResponseTime == zero_time()) {
+        tbe.FirstResponseTime := curCycle();
+      }
+    }
+  }
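+
+  // NOTE: with a full-bit directory some sharers never send explicit acks;
+  // the first response carries their count in SilentAcks, and it is
+  // subtracted exactly once (guarded by AppliedSilentAcks). Hypothetical
+  // example: n = 8 with three silent sharers -> the first response lowers
+  // NumPendingMsgs by 3 in addition to its own Acks field.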
+
+  action(uo_updateCurrentOwner, "uo", desc="When moving SS state, update current owner.") {
+    peek(responseToCache_in, ResponseMsg) {
+      assert(is_valid(tbe));
+      tbe.CurOwner := in_msg.Sender;
+    }
+  }
+
+  action(n_popResponseQueue, "n", desc="Pop response queue") {
+    responseToCache_in.dequeue();
+  }
+
+  action(ll_L2toL1Transfer, "ll", desc="Schedule the L2 to L1 transfer trigger") {
+    enqueue(triggerQueue_out, TriggerMsg, l2_cache_hit_latency) {
+      out_msg.Addr := address;
+      out_msg.Type := TriggerType:L2_to_L1;
+    }
+  }
+
+  action(o_checkForCompletion, "o", desc="Check if we have received all the messages required for completion") {
+    assert(is_valid(tbe));
+    if (tbe.NumPendingMsgs == 0) {
+      enqueue(triggerQueue_out, TriggerMsg) {
+        out_msg.Addr := address;
+        if (tbe.Sharers) {
+          out_msg.Type := TriggerType:ALL_ACKS;
+        } else {
+          out_msg.Type := TriggerType:ALL_ACKS_NO_SHARERS;
+        }
+      }
+    }
+  }
+
+  action(p_decrementNumberOfMessagesByOne, "p", desc="Decrement the number of messages for which we're waiting by one") {
+    assert(is_valid(tbe));
+    tbe.NumPendingMsgs := tbe.NumPendingMsgs - 1;
+  }
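+
+  // NOTE: p_/pp_ adjust the expected-response count by one when the usual
+  // n-message estimate is off: a Store from O already holds the owner's
+  // data, so one response fewer is expected (p_...ByOne in the O/OR Store
+  // transitions), while answering an Other_GETX from OM adds one expected
+  // message back (pp_...ByOne).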
+
+  action(pp_incrementNumberOfMessagesByOne, "\p", desc="Increment the number of messages for which we're waiting by one") {
+    assert(is_valid(tbe));
+    tbe.NumPendingMsgs := tbe.NumPendingMsgs + 1;
+  }
+
+  action(q_sendDataFromTBEToCache, "q", desc="Send data from TBE to cache") {
+    peek(forwardToCache_in, RequestMsg) {
+      assert(in_msg.Requestor != machineID);
+      enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) {
+        assert(is_valid(tbe));
+        out_msg.Addr := address;
+        out_msg.Type := CoherenceResponseType:DATA;
+        out_msg.Sender := machineID;
+        out_msg.Destination.add(in_msg.Requestor);
+        out_msg.OriginalDestination.add(in_msg.Requestor);
+        DPRINTF(RubySlicc, "%s\n", out_msg.Destination);
+        out_msg.DataBlk := tbe.DataBlk;
+        out_msg.Dirty := tbe.Dirty;
+        if (in_msg.DirectedProbe) {
+          out_msg.Acks := machineCount(MachineType:L1Cache);
+        } else {
+          out_msg.Acks := 2;
+        }
+        out_msg.SilentAcks := in_msg.SilentAcks;
+        out_msg.MessageSize := MessageSizeType:Response_Data;
+        out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+        out_msg.ForwardRequestTime := in_msg.ForwardRequestTime;
+      }
+    }
+  }
+
+  action(sq_sendSharedDataFromTBEToCache, "sq", desc="Send shared data from TBE to cache, still the owner") {
+    peek(forwardToCache_in, RequestMsg) {
+      assert(in_msg.Requestor != machineID);
+      enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) {
+        assert(is_valid(tbe));
+        out_msg.Addr := address;
+        out_msg.Type := CoherenceResponseType:DATA_SHARED;
+        out_msg.Sender := machineID;
+        out_msg.Destination.add(in_msg.Requestor);
+        out_msg.OriginalDestination.add(in_msg.Requestor);
+        DPRINTF(RubySlicc, "%s\n", out_msg.Destination);
+        out_msg.DataBlk := tbe.DataBlk;
+        out_msg.Dirty := tbe.Dirty;
+        if (in_msg.DirectedProbe) {
+          out_msg.Acks := machineCount(MachineType:L1Cache);
+        } else {
+          out_msg.Acks := 2;
+        }
+        out_msg.SilentAcks := in_msg.SilentAcks;
+        out_msg.MessageSize := MessageSizeType:Response_Data;
+        out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+        out_msg.ForwardRequestTime := in_msg.ForwardRequestTime;
+      }
+    }
+  }
+
+  action(qm_sendDataFromTBEToCache, "qm", desc="Send data from TBE to cache, multiple sharers, still the owner") {
+    peek(forwardToCache_in, RequestMsg) {
+      enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) {
+        assert(is_valid(tbe));
+        out_msg.Addr := address;
+        out_msg.Type := CoherenceResponseType:DATA_SHARED;
+        out_msg.Sender := machineID;
+        out_msg.Destination := in_msg.MergedRequestors;
+        out_msg.OriginalDestination := in_msg.MergedRequestors;
+        DPRINTF(RubySlicc, "%s\n", out_msg.Destination);
+        out_msg.DataBlk := tbe.DataBlk;
+        out_msg.Dirty := tbe.Dirty;
+        out_msg.Acks := machineCount(MachineType:L1Cache);
+        out_msg.SilentAcks := in_msg.SilentAcks;
+        out_msg.MessageSize := MessageSizeType:Response_Data;
+        out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+        out_msg.ForwardRequestTime := in_msg.ForwardRequestTime;
+      }
+    }
+  }
+
+  action(qq_sendDataFromTBEToMemory, "\q", desc="Send data from TBE to memory") {
+    enqueue(unblockNetwork_out, ResponseMsg, cache_response_latency) {
+      assert(is_valid(tbe));
+      out_msg.Addr := address;
+      out_msg.Sender := machineID;
+      if (is_gpu) {
+        out_msg.Destination.broadcast(MachineType:BorderControlUnit);
+      } else {
+        out_msg.Destination.add(map_Address_to_Directory(address));
+      }
+      out_msg.OriginalDestination.add(map_Address_to_Directory(address));
+      out_msg.Dirty := tbe.Dirty;
+      if (tbe.Dirty) {
+        out_msg.Type := CoherenceResponseType:WB_DIRTY;
+        out_msg.DataBlk := tbe.DataBlk;
+        out_msg.MessageSize := MessageSizeType:Writeback_Data;
+      } else {
+        out_msg.Type := CoherenceResponseType:WB_CLEAN;
+        // NOTE: in a real system this would not send data. We send
+        // data here only so we can check it at the memory
+        out_msg.DataBlk := tbe.DataBlk;
+        out_msg.MessageSize := MessageSizeType:Writeback_Control;
+      }
+    }
+  }
+
+  action(r_setSharerBit, "r", desc="We saw other sharers") {
+    assert(is_valid(tbe));
+    tbe.Sharers := true;
+  }
+
+  action(s_deallocateTBE, "s", desc="Deallocate TBE") {
+    TBEs.deallocate(address);
+    unset_tbe();
+  }
+
+  action(t_sendExclusiveDataFromTBEToMemory, "t", desc="Send exclusive data from TBE to memory") {
+    enqueue(unblockNetwork_out, ResponseMsg, cache_response_latency) {
+      assert(is_valid(tbe));
+      out_msg.Addr := address;
+      out_msg.Sender := machineID;
+      out_msg.Destination.add(map_Address_to_Directory(address));
+      out_msg.DataBlk := tbe.DataBlk;
+      out_msg.Dirty := tbe.Dirty;
+      if (tbe.Dirty) {
+        out_msg.Type := CoherenceResponseType:WB_EXCLUSIVE_DIRTY;
+        out_msg.DataBlk := tbe.DataBlk;
+        out_msg.MessageSize := MessageSizeType:Writeback_Data;
+      } else {
+        out_msg.Type := CoherenceResponseType:WB_EXCLUSIVE_CLEAN;
+        // NOTE: in a real system this would not send data. We send
+        // data here only so we can check it at the memory
+        out_msg.DataBlk := tbe.DataBlk;
+        out_msg.MessageSize := MessageSizeType:Writeback_Control;
+      }
+    }
+  }
+
+  action(u_writeDataToCache, "u", desc="Write data to cache") {
+    peek(responseToCache_in, ResponseMsg) {
+      assert(is_valid(cache_entry));
+      cache_entry.DataBlk := in_msg.DataBlk;
+      cache_entry.Dirty := in_msg.Dirty;
+    }
+  }
+
+  action(uf_writeDataToCacheTBE, "uf", desc="Write data to TBE") {
+    peek(responseToCache_in, ResponseMsg) {
+      assert(is_valid(tbe));
+      tbe.DataBlk := in_msg.DataBlk;
+      tbe.Dirty := in_msg.Dirty;
+    }
+  }
+
+  action(v_writeDataToCacheVerify, "v", desc="Write data to cache, assert it was same as before") {
+    peek(responseToCache_in, ResponseMsg) {
+      assert(is_valid(cache_entry));
+      DPRINTF(RubySlicc, "Cached Data Block: %s, Msg Data Block: %s\n",
+              cache_entry.DataBlk, in_msg.DataBlk);
+      assert(cache_entry.DataBlk == in_msg.DataBlk);
+      cache_entry.DataBlk := in_msg.DataBlk;
+      cache_entry.Dirty := in_msg.Dirty || cache_entry.Dirty;
+    }
+  }
+
+  action(vt_writeDataToTBEVerify, "vt", desc="Write data to TBE, assert it was same as before") {
+    peek(responseToCache_in, ResponseMsg) {
+      assert(is_valid(tbe));
+      DPRINTF(RubySlicc, "Cached Data Block: %s, Msg Data Block: %s\n",
+              tbe.DataBlk, in_msg.DataBlk);
+      assert(tbe.DataBlk == in_msg.DataBlk);
+      tbe.DataBlk := in_msg.DataBlk;
+      tbe.Dirty := in_msg.Dirty || tbe.Dirty;
+    }
+  }
+
+  action(gg_deallocateL1CacheBlock, "\g", desc="Deallocate cache block. Sets the cache to invalid, allowing a replacement in parallel with a fetch.") {
+    if (L1Dcache.isTagPresent(address)) {
+      L1Dcache.deallocate(address);
+    } else {
+      L1Icache.deallocate(address);
+    }
+    unset_cache_entry();
+  }
+
+  action(ii_allocateL1DCacheBlock, "\i", desc="Set L1 D-cache tag equal to tag of block B.") {
+    if (is_invalid(cache_entry)) {
+      set_cache_entry(L1Dcache.allocate(address, new Entry));
+    }
+  }
+
+  action(jj_allocateL1ICacheBlock, "\j", desc="Set L1 I-cache tag equal to tag of block B.") {
+    if (is_invalid(cache_entry)) {
+      set_cache_entry(L1Icache.allocate(address, new Entry));
+    }
+  }
+
+  action(vv_allocateL2CacheBlock, "\v", desc="Set L2 cache tag equal to tag of block B.") {
+    set_cache_entry(L2cache.allocate(address, new Entry));
+  }
+
+  action(rr_deallocateL2CacheBlock, "\r", desc="Deallocate L2 cache block. Sets the cache to not present, allowing a replacement in parallel with a fetch.") {
+    L2cache.deallocate(address);
+    unset_cache_entry();
+  }
+
+  action(forward_eviction_to_cpu, "\cc", desc="sends eviction information to the processor") {
+    if (send_evictions) {
+      DPRINTF(RubySlicc, "Sending invalidation for %s to the CPU\n", address);
+      sequencer.evictionCallback(address);
+    }
+  }
+
+  action(uu_profileL1DataMiss, "\udm", desc="Profile the demand miss") {
+    ++L1Dcache.demand_misses;
+  }
+
+  action(uu_profileL1DataHit, "\udh", desc="Profile the demand hits") {
+    ++L1Dcache.demand_hits;
+  }
+
+  action(uu_profileL1InstMiss, "\uim", desc="Profile the demand miss") {
+    ++L1Icache.demand_misses;
+  }
+
+  action(uu_profileL1InstHit, "\uih", desc="Profile the demand hits") {
+    ++L1Icache.demand_hits;
+  }
+
+  action(uu_profileL2Miss, "\um", desc="Profile the demand miss") {
+    ++L2cache.demand_misses;
+  }
+
+  action(uu_profileL2Hit, "\uh", desc="Profile the demand hits") {
+    ++L2cache.demand_hits;
+  }
+
+  action(zz_stallAndWaitMandatoryQueue, "\z", desc="Send the head of the mandatory queue to the back of the queue.") {
+    stall_and_wait(mandatoryQueue_in, address);
+  }
+
+  action(z_stall, "z", desc="stall") {
+    // do nothing and the special z_stall action will return a protocol stall
+    // so that the next port is checked
+  }
+
+  action(kd_wakeUpDependents, "kd", desc="wake-up dependents") {
+    wakeUpBuffers(address);
+  }
+
+  action(ka_wakeUpAllDependents, "ka", desc="wake-up all dependents") {
+    wakeUpAllBuffers();
+  }
+
+  //*****************************************************
+  // TRANSITIONS
+  //*****************************************************
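+
+  // NOTE: each transition reads (current state, event[, next state])
+  // followed by its ordered action list; transitions with no explicit next
+  // state stay put. {IT, ST, OT, MT, MMT} are the transient L2-to-L1
+  // transfer states and {IR, SR, OR, MR, MMR} their transfer-complete
+  // counterparts.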
+
+  // Transitions for Load/Store/L2_Replacement from transient states
+  transition({IM, IM_F, MM_WF, SM, SM_F, ISM, ISM_F, OM, OM_F, IS, SS, OI, MI, II, IT, ST, OT, MT, MMT}, {Store, L2_Replacement}) {
+    zz_stallAndWaitMandatoryQueue;
+  }
+
+  transition({IM, IM_F, MM_WF, SM, SM_F, ISM, ISM_F, OM, OM_F, IS, SS, OI, MI, II}, {Flush_line}) {
+    zz_stallAndWaitMandatoryQueue;
+  }
+
+  transition({M_W, MM_W}, {L2_Replacement, Flush_line}) {
+    zz_stallAndWaitMandatoryQueue;
+  }
+
+  transition({IM, IS, OI, MI, II, IT, ST, OT, MT, MMT, MI_F, MM_F, OM_F, IM_F, ISM_F, SM_F, MM_WF}, {Load, Ifetch}) {
+    zz_stallAndWaitMandatoryQueue;
+  }
+
+  transition({IM, SM, ISM, OM, IS, SS, MM_W, M_W, OI, MI, II, IT, ST, OT, MT, MMT, IM_F, SM_F, ISM_F, OM_F, MM_WF, MI_F, MM_F, IR, SR, OR, MR, MMR}, L1_to_L2) {
+    zz_stallAndWaitMandatoryQueue;
+  }
+
+  transition({MI_F, MM_F}, {Store}) {
+    zz_stallAndWaitMandatoryQueue;
+  }
+
+  transition({MM_F, MI_F}, {Flush_line}) {
+    zz_stallAndWaitMandatoryQueue;
+  }
+
+  transition({IT, ST, OT, MT, MMT}, {Other_GETX, NC_DMA_GETS, Other_GETS, Merged_GETS, Other_GETS_No_Mig, Invalidate, Flush_line}) {
+    z_stall;
+  }
+
+  transition({IR, SR, OR, MR, MMR}, {Other_GETX, NC_DMA_GETS, Other_GETS, Merged_GETS, Other_GETS_No_Mig, Invalidate}) {
+    z_stall;
+  }
+
+  // Transitions moving data between the L1 and L2 caches
+  transition({I, S, O, M, MM}, L1_to_L2) {
+    i_allocateTBE;
+    gg_deallocateL1CacheBlock;
+    vv_allocateL2CacheBlock;
+    hp_copyFromTBEToL2;
+    s_deallocateTBE;
+  }
+
+  transition(I, Trigger_L2_to_L1D, IT) {
+    i_allocateTBE;
+    rr_deallocateL2CacheBlock;
+    ii_allocateL1DCacheBlock;
+    nb_copyFromTBEToL1; // Not really needed for state I
+    s_deallocateTBE;
+    zz_stallAndWaitMandatoryQueue;
+    ll_L2toL1Transfer;
+  }
+
+  transition(S, Trigger_L2_to_L1D, ST) {
+    i_allocateTBE;
+    rr_deallocateL2CacheBlock;
+    ii_allocateL1DCacheBlock;
+    nb_copyFromTBEToL1;
+    s_deallocateTBE;
+    zz_stallAndWaitMandatoryQueue;
+    ll_L2toL1Transfer;
+  }
+
+  transition(O, Trigger_L2_to_L1D, OT) {
+    i_allocateTBE;
+    rr_deallocateL2CacheBlock;
+    ii_allocateL1DCacheBlock;
+    nb_copyFromTBEToL1;
+    s_deallocateTBE;
+    zz_stallAndWaitMandatoryQueue;
+    ll_L2toL1Transfer;
+  }
+
+  transition(M, Trigger_L2_to_L1D, MT) {
+    i_allocateTBE;
+    rr_deallocateL2CacheBlock;
+    ii_allocateL1DCacheBlock;
+    nb_copyFromTBEToL1;
+    s_deallocateTBE;
+    zz_stallAndWaitMandatoryQueue;
+    ll_L2toL1Transfer;
+  }
+
+  transition(MM, Trigger_L2_to_L1D, MMT) {
+    i_allocateTBE;
+    rr_deallocateL2CacheBlock;
+    ii_allocateL1DCacheBlock;
+    nb_copyFromTBEToL1;
+    s_deallocateTBE;
+    zz_stallAndWaitMandatoryQueue;
+    ll_L2toL1Transfer;
+  }
+
+  transition(I, Trigger_L2_to_L1I, IT) {
+    i_allocateTBE;
+    rr_deallocateL2CacheBlock;
+    jj_allocateL1ICacheBlock;
+    nb_copyFromTBEToL1;
+    s_deallocateTBE;
+    zz_stallAndWaitMandatoryQueue;
+    ll_L2toL1Transfer;
+  }
+
+  transition(S, Trigger_L2_to_L1I, ST) {
+    i_allocateTBE;
+    rr_deallocateL2CacheBlock;
+    jj_allocateL1ICacheBlock;
+    nb_copyFromTBEToL1;
+    s_deallocateTBE;
+    zz_stallAndWaitMandatoryQueue;
+    ll_L2toL1Transfer;
+  }
+
+  transition(O, Trigger_L2_to_L1I, OT) {
+    i_allocateTBE;
+    rr_deallocateL2CacheBlock;
+    jj_allocateL1ICacheBlock;
+    nb_copyFromTBEToL1;
+    s_deallocateTBE;
+    zz_stallAndWaitMandatoryQueue;
+    ll_L2toL1Transfer;
+  }
+
+  transition(M, Trigger_L2_to_L1I, MT) {
+    i_allocateTBE;
+    rr_deallocateL2CacheBlock;
+    jj_allocateL1ICacheBlock;
+    nb_copyFromTBEToL1;
+    s_deallocateTBE;
+    zz_stallAndWaitMandatoryQueue;
+    ll_L2toL1Transfer;
+  }
+
+  transition(MM, Trigger_L2_to_L1I, MMT) {
+    i_allocateTBE;
+    rr_deallocateL2CacheBlock;
+    jj_allocateL1ICacheBlock;
+    nb_copyFromTBEToL1;
+    s_deallocateTBE;
+    zz_stallAndWaitMandatoryQueue;
+    ll_L2toL1Transfer;
+  }
+
+  transition(IT, Complete_L2_to_L1, IR) {
+    j_popTriggerQueue;
+    kd_wakeUpDependents;
+  }
+
+  transition(ST, Complete_L2_to_L1, SR) {
+    j_popTriggerQueue;
+    kd_wakeUpDependents;
+  }
+
+  transition(OT, Complete_L2_to_L1, OR) {
+    j_popTriggerQueue;
+    kd_wakeUpDependents;
+  }
+
+  transition(MT, Complete_L2_to_L1, MR) {
+    j_popTriggerQueue;
+    kd_wakeUpDependents;
+  }
+
+  transition(MMT, Complete_L2_to_L1, MMR) {
+    j_popTriggerQueue;
+    kd_wakeUpDependents;
+  }
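+
+  // NOTE: the ten Trigger_L2_to_L1{D,I} transitions above share one shape:
+  // stage the block in a TBE, swap the L2 entry for a fresh L1 entry,
+  // requeue the pending request (zz_stallAndWaitMandatoryQueue), and
+  // schedule Complete_L2_to_L1 after l2_cache_hit_latency; the X -> XT -> XR
+  // chain preserves the stable state across the move.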
+
+  // Transitions from Idle
+  transition({I,IR}, Load, IS) {
+    ii_allocateL1DCacheBlock;
+    i_allocateTBE;
+    a_issueGETS;
+    uu_profileL1DataMiss;
+    uu_profileL2Miss;
+    k_popMandatoryQueue;
+  }
+
+  transition({I,IR}, Ifetch, IS) {
+    jj_allocateL1ICacheBlock;
+    i_allocateTBE;
+    a_issueGETS;
+    uu_profileL1InstMiss;
+    uu_profileL2Miss;
+    k_popMandatoryQueue;
+  }
+
+  transition({I,IR}, Store, IM) {
+    ii_allocateL1DCacheBlock;
+    i_allocateTBE;
+    b_issueGETX;
+    uu_profileL1DataMiss;
+    uu_profileL2Miss;
+    k_popMandatoryQueue;
+  }
+
+  transition({I, IR}, Flush_line, IM_F) {
+    it_allocateTBE;
+    bf_issueGETF;
+    k_popMandatoryQueue;
+  }
+
+  transition(I, L2_Replacement) {
+    rr_deallocateL2CacheBlock;
+    ka_wakeUpAllDependents;
+  }
+
+  transition(I, {Other_GETX, NC_DMA_GETS, Other_GETS, Other_GETS_No_Mig, Invalidate}) {
+    f_sendAck;
+    l_popForwardQueue;
+  }
+
+  // Transitions from Shared
+  transition({S, SM, ISM}, Load) {
+    h_load_hit;
+    uu_profileL1DataHit;
+    k_popMandatoryQueue;
+  }
+
+  transition({S, SM, ISM}, Ifetch) {
+    h_load_hit;
+    uu_profileL1InstHit;
+    k_popMandatoryQueue;
+  }
+
+  transition(SR, Load, S) {
+    h_load_hit;
+    uu_profileL1DataMiss;
+    uu_profileL2Hit;
+    k_popMandatoryQueue;
+    ka_wakeUpAllDependents;
+  }
+
+  transition(SR, Ifetch, S) {
+    h_load_hit;
+    uu_profileL1InstMiss;
+    uu_profileL2Hit;
+    k_popMandatoryQueue;
+    ka_wakeUpAllDependents;
+  }
+
+  transition({S,SR}, Store, SM) {
+    i_allocateTBE;
+    b_issueGETX;
+    uu_profileL1DataMiss;
+    uu_profileL2Miss;
+    k_popMandatoryQueue;
+  }
+
+  transition({S, SR}, Flush_line, SM_F) {
+    i_allocateTBE;
+    bf_issueGETF;
+    forward_eviction_to_cpu;
+    gg_deallocateL1CacheBlock;
+    k_popMandatoryQueue;
+  }
+
+  transition(S, L2_Replacement, I) {
+    forward_eviction_to_cpu;
+    rr_deallocateL2CacheBlock;
+    ka_wakeUpAllDependents;
+  }
+
+  transition(S, {Other_GETX, Invalidate}, I) {
+    f_sendAck;
+    forward_eviction_to_cpu;
+    l_popForwardQueue;
+  }
+
+  transition(S, {NC_DMA_GETS, Other_GETS, Other_GETS_No_Mig}) {
+    ff_sendAckShared;
+    l_popForwardQueue;
+  }
+
+  // Transitions from Owned
+  transition({O, OM, SS, MM_W, M_W}, {Load}) {
+    h_load_hit;
+    uu_profileL1DataHit;
+    k_popMandatoryQueue;
+  }
+
+  transition({O, OM, SS, MM_W, M_W}, {Ifetch}) {
+    h_load_hit;
+    uu_profileL1InstHit;
+    k_popMandatoryQueue;
+  }
+
+  transition(OR, Load, O) {
+    h_load_hit;
+    uu_profileL1DataMiss;
+    uu_profileL2Hit;
+    k_popMandatoryQueue;
+    ka_wakeUpAllDependents;
+  }
+
+  transition(OR, Ifetch, O) {
+    h_load_hit;
+    uu_profileL1InstMiss;
+    uu_profileL2Hit;
+    k_popMandatoryQueue;
+    ka_wakeUpAllDependents;
+  }
+
+  transition({O,OR}, Store, OM) {
+    i_allocateTBE;
+    b_issueGETX;
+    p_decrementNumberOfMessagesByOne;
+    uu_profileL1DataMiss;
+    uu_profileL2Miss;
+    k_popMandatoryQueue;
+  }
+
+  transition({O, OR}, Flush_line, OM_F) {
+    i_allocateTBE;
+    bf_issueGETF;
+    p_decrementNumberOfMessagesByOne;
+    forward_eviction_to_cpu;
+    gg_deallocateL1CacheBlock;
+    k_popMandatoryQueue;
+  }
+
+  transition(O, L2_Replacement, OI) {
+    i_allocateTBE;
+    d_issuePUT;
+    forward_eviction_to_cpu;
+    rr_deallocateL2CacheBlock;
+    ka_wakeUpAllDependents;
+  }
+
+  transition(O, {Other_GETX, Invalidate}, I) {
+    e_sendData;
+    forward_eviction_to_cpu;
+    l_popForwardQueue;
+  }
+
+  transition(O, {NC_DMA_GETS, Other_GETS, Other_GETS_No_Mig}) {
+    ee_sendDataShared;
+    l_popForwardQueue;
+  }
+
+  transition(O, Merged_GETS) {
+    em_sendDataSharedMultiple;
+    l_popForwardQueue;
+  }
+
+  // Transitions from Modified
+  transition({MM, M}, {Ifetch}) {
+    h_load_hit;
+    uu_profileL1InstHit;
+    k_popMandatoryQueue;
+  }
+
+  transition({MM, M}, {Load}) {
+    h_load_hit;
+    uu_profileL1DataHit;
+    k_popMandatoryQueue;
+  }
+
+  transition(MM, Store) {
+    hh_store_hit;
+    uu_profileL1DataHit;
+    k_popMandatoryQueue;
+  }
+
+  transition(MMR, Load, MM) {
+    h_load_hit;
+    uu_profileL1DataMiss;
+    uu_profileL2Hit;
+    k_popMandatoryQueue;
+    ka_wakeUpAllDependents;
+  }
+
+  transition(MMR, Ifetch, MM) {
+    h_load_hit;
+    uu_profileL1InstMiss;
+    uu_profileL2Hit;
+    k_popMandatoryQueue;
+    ka_wakeUpAllDependents;
+  }
+
+  transition(MMR, Store, MM) {
+    hh_store_hit;
+    uu_profileL1DataMiss;
+    uu_profileL2Hit;
+    k_popMandatoryQueue;
+    ka_wakeUpAllDependents;
+  }
+
+  transition({MM, M, MMR, MR}, Flush_line, MM_F) {
+    i_allocateTBE;
+    bf_issueGETF;
+    p_decrementNumberOfMessagesByOne;
+    forward_eviction_to_cpu;
+    gg_deallocateL1CacheBlock;
+    k_popMandatoryQueue;
+  }
+
+  transition(MM_F, Block_Ack, MI_F) {
+    df_issuePUTF;
+    l_popForwardQueue;
+    kd_wakeUpDependents;
+  }
+
+  transition(MM, L2_Replacement, MI) {
+    i_allocateTBE;
+    d_issuePUT;
+    forward_eviction_to_cpu;
+    rr_deallocateL2CacheBlock;
+    ka_wakeUpAllDependents;
+  }
+
+  transition(MM, {Other_GETX, Invalidate}, I) {
+    c_sendExclusiveData;
+    forward_eviction_to_cpu;
+    l_popForwardQueue;
+  }
+
+  transition(MM, Other_GETS, I) {
+    c_sendExclusiveData;
+    forward_eviction_to_cpu;
+    l_popForwardQueue;
+  }
+
+  transition(MM, NC_DMA_GETS, O) {
+    ee_sendDataShared;
+    l_popForwardQueue;
+  }
+
+  transition(MM, Other_GETS_No_Mig, O) {
+    ee_sendDataShared;
+    l_popForwardQueue;
+  }
+
+  transition(MM, Merged_GETS, O) {
+    em_sendDataSharedMultiple;
+    l_popForwardQueue;
+  }
+
+  // Transitions from Dirty Exclusive
+  transition(M, Store, MM) {
+    hh_store_hit;
+    uu_profileL1DataHit;
+    k_popMandatoryQueue;
+  }
+
+  transition(MR, Load, M) {
+    h_load_hit;
+    uu_profileL1DataMiss;
+    uu_profileL2Hit;
+    k_popMandatoryQueue;
+    ka_wakeUpAllDependents;
+  }
+
+  transition(MR, Ifetch, M) {
+    h_load_hit;
+    uu_profileL1InstMiss;
+    uu_profileL2Hit;
+    k_popMandatoryQueue;
+    ka_wakeUpAllDependents;
+  }
+
+  transition(MR, Store, MM) {
+    hh_store_hit;
+    uu_profileL1DataMiss;
+    uu_profileL2Hit;
+    k_popMandatoryQueue;
+    ka_wakeUpAllDependents;
+  }
+
+  transition(M, L2_Replacement, MI) {
+    i_allocateTBE;
+    d_issuePUT;
+    forward_eviction_to_cpu;
+    rr_deallocateL2CacheBlock;
+    ka_wakeUpAllDependents;
+  }
+
+  transition(M, {Other_GETX, Invalidate}, I) {
+    c_sendExclusiveData;
+    forward_eviction_to_cpu;
+    l_popForwardQueue;
+  }
+
+  transition(M, {Other_GETS, Other_GETS_No_Mig}, O) {
+    ee_sendDataShared;
+    l_popForwardQueue;
+  }
+
+  transition(M, NC_DMA_GETS, O) {
+    ee_sendDataShared;
+    l_popForwardQueue;
+  }
+
+  transition(M, Merged_GETS, O) {
+    em_sendDataSharedMultiple;
+    l_popForwardQueue;
+  }
+
+  // Transitions from IM
+
+  transition({IM, IM_F}, {Other_GETX, NC_DMA_GETS, Other_GETS, Other_GETS_No_Mig, Invalidate}) {
+    f_sendAck;
+    l_popForwardQueue;
+  }
+
+  transition({IM, IM_F, MM_F}, Ack) {
+    m_decrementNumberOfMessages;
+    o_checkForCompletion;
+    n_popResponseQueue;
+  }
+
+  transition(IM, Data, ISM) {
+    u_writeDataToCache;
+    m_decrementNumberOfMessages;
+    o_checkForCompletion;
+    n_popResponseQueue;
+  }
+
+  transition(IM_F, Data, ISM_F) {
+    uf_writeDataToCacheTBE;
+    m_decrementNumberOfMessages;
+    o_checkForCompletion;
+    n_popResponseQueue;
+  }
+
+  transition(IM, Exclusive_Data, MM_W) {
+    u_writeDataToCache;
+    m_decrementNumberOfMessages;
+    o_checkForCompletion;
+    sx_external_store_hit;
+    n_popResponseQueue;
+    kd_wakeUpDependents;
+  }
+
+  transition(IM_F, Exclusive_Data, MM_WF) {
+    uf_writeDataToCacheTBE;
+    m_decrementNumberOfMessages;
+    o_checkForCompletion;
+    n_popResponseQueue;
+  }
+
+  // Transitions from SM
+  transition({SM, SM_F}, {NC_DMA_GETS, Other_GETS, Other_GETS_No_Mig}) {
+    ff_sendAckShared;
+    l_popForwardQueue;
+  }
+
+  transition(SM, {Other_GETX, Invalidate}, IM) {
+    f_sendAck;
+    forward_eviction_to_cpu;
+    l_popForwardQueue;
+  }
+
+  transition(SM_F, {Other_GETX, Invalidate}, IM_F) {
+    f_sendAck;
+    forward_eviction_to_cpu;
+    l_popForwardQueue;
+  }
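+
+  // NOTE: in SM the line is still valid in S, so arriving data must match
+  // the locally cached copy; the verify actions (v_/vt_) assert DataBlk
+  // equality rather than blindly overwriting it.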
+
+  transition({SM, SM_F}, Ack) {
+    m_decrementNumberOfMessages;
+    o_checkForCompletion;
+    n_popResponseQueue;
+  }
+
+  transition(SM, {Data, Exclusive_Data}, ISM) {
+    v_writeDataToCacheVerify;
+    m_decrementNumberOfMessages;
+    o_checkForCompletion;
+    n_popResponseQueue;
+  }
+
+  transition(SM_F, {Data, Exclusive_Data}, ISM_F) {
+    vt_writeDataToTBEVerify;
+    m_decrementNumberOfMessages;
+    o_checkForCompletion;
+    n_popResponseQueue;
+  }
+
+  // Transitions from ISM
+  transition({ISM, ISM_F}, Ack) {
+    m_decrementNumberOfMessages;
+    o_checkForCompletion;
+    n_popResponseQueue;
+  }
+
+  transition(ISM, All_acks_no_sharers, MM) {
+    sxt_trig_ext_store_hit;
+    gm_sendUnblockM;
+    s_deallocateTBE;
+    j_popTriggerQueue;
+    kd_wakeUpDependents;
+  }
+
+  transition(ISM_F, All_acks_no_sharers, MI_F) {
+    df_issuePUTF;
+    j_popTriggerQueue;
+    kd_wakeUpDependents;
+  }
+
+  // Transitions from OM
+
+  transition(OM, {Other_GETX, Invalidate}, IM) {
+    e_sendData;
+    pp_incrementNumberOfMessagesByOne;
+    forward_eviction_to_cpu;
+    l_popForwardQueue;
+  }
+
+  transition(OM_F, {Other_GETX, Invalidate}, IM_F) {
+    q_sendDataFromTBEToCache;
+    pp_incrementNumberOfMessagesByOne;
+    forward_eviction_to_cpu;
+    l_popForwardQueue;
+  }
+
+  transition(OM, {NC_DMA_GETS, Other_GETS, Other_GETS_No_Mig}) {
+    ee_sendDataShared;
+    l_popForwardQueue;
+  }
+
+  transition(OM, Merged_GETS) {
+    em_sendDataSharedMultiple;
+    l_popForwardQueue;
+  }
+
+  transition(OM_F, {NC_DMA_GETS, Other_GETS, Other_GETS_No_Mig}) {
+    et_sendDataSharedFromTBE;
+    l_popForwardQueue;
+  }
+
+  transition(OM_F, Merged_GETS) {
+    emt_sendDataSharedMultipleFromTBE;
+    l_popForwardQueue;
+  }
+
+  transition({OM, OM_F}, Ack) {
+    m_decrementNumberOfMessages;
+    o_checkForCompletion;
+    n_popResponseQueue;
+  }
+
+  transition(OM, {All_acks, All_acks_no_sharers}, MM) {
+    sxt_trig_ext_store_hit;
+    gm_sendUnblockM;
+    s_deallocateTBE;
+    j_popTriggerQueue;
+    kd_wakeUpDependents;
+  }
+
+  transition({MM_F, OM_F}, {All_acks, All_acks_no_sharers}, MI_F) {
+    df_issuePUTF;
+    j_popTriggerQueue;
+    kd_wakeUpDependents;
+  }
+
+  // Transitions from IS
+
+  transition(IS, {Other_GETX, NC_DMA_GETS, Other_GETS, Other_GETS_No_Mig, Invalidate}) {
+    f_sendAck;
+    l_popForwardQueue;
+  }
+
+  transition(IS, Ack) {
+    m_decrementNumberOfMessages;
+    o_checkForCompletion;
+    n_popResponseQueue;
+  }
+
+  transition(IS, Shared_Ack) {
+    m_decrementNumberOfMessages;
+    r_setSharerBit;
+    o_checkForCompletion;
+    n_popResponseQueue;
+  }
+
+  transition(IS, Data, SS) {
+    u_writeDataToCache;
+    m_decrementNumberOfMessages;
+    o_checkForCompletion;
+    hx_external_load_hit;
+    uo_updateCurrentOwner;
+    n_popResponseQueue;
+    kd_wakeUpDependents;
+  }
+
+  transition(IS, Exclusive_Data, M_W) {
+    u_writeDataToCache;
+    m_decrementNumberOfMessages;
+    o_checkForCompletion;
+    hx_external_load_hit;
+    n_popResponseQueue;
+    kd_wakeUpDependents;
+  }
+
+  transition(IS, Shared_Data, SS) {
+    u_writeDataToCache;
+    r_setSharerBit;
+    m_decrementNumberOfMessages;
+    o_checkForCompletion;
+    hx_external_load_hit;
+    uo_updateCurrentOwner;
+    n_popResponseQueue;
+    kd_wakeUpDependents;
+  }
+
+  // Transitions from SS
+
+  transition(SS, Ack) {
+    m_decrementNumberOfMessages;
+    o_checkForCompletion;
+    n_popResponseQueue;
+  }
+
+  transition(SS, Shared_Ack) {
+    m_decrementNumberOfMessages;
+    r_setSharerBit;
+    o_checkForCompletion;
+    n_popResponseQueue;
+  }
+
+  transition(SS, All_acks, S) {
+    gs_sendUnblockS;
+    s_deallocateTBE;
+    j_popTriggerQueue;
+    kd_wakeUpDependents;
+  }
+
+  transition(SS, All_acks_no_sharers, S) {
+    // Note: The directory might still be the owner, so that is why we go to S
+    gs_sendUnblockS;
+    s_deallocateTBE;
+    j_popTriggerQueue;
+    kd_wakeUpDependents;
+  }
+
+  // Transitions from MM_W
+
+  transition(MM_W, Store) {
+    hh_store_hit;
+    uu_profileL1DataHit;
+    k_popMandatoryQueue;
+  }
+
+  transition({MM_W, MM_WF}, Ack) {
+    m_decrementNumberOfMessages;
+    o_checkForCompletion;
+    n_popResponseQueue;
+  }
+
+  transition(MM_W, All_acks_no_sharers, MM) {
+    gm_sendUnblockM;
+    s_deallocateTBE;
+    j_popTriggerQueue;
+    kd_wakeUpDependents;
+  }
+
+  transition(MM_WF, All_acks_no_sharers, MI_F) {
+    df_issuePUTF;
+    j_popTriggerQueue;
+    kd_wakeUpDependents;
+  }
+
+  // Transitions from M_W
+
+  transition(M_W, Store, MM_W) {
+    hh_store_hit;
+    uu_profileL1DataHit;
+    k_popMandatoryQueue;
+  }
+
+  transition(M_W, Ack) {
+    m_decrementNumberOfMessages;
+    o_checkForCompletion;
+    n_popResponseQueue;
+  }
+
+  transition(M_W, All_acks_no_sharers, M) {
+    gm_sendUnblockM;
+    s_deallocateTBE;
+    j_popTriggerQueue;
+    kd_wakeUpDependents;
+  }
+
+  // Transitions from OI/MI
+
+  transition({OI, MI}, {Other_GETX, Invalidate}, II) {
+    q_sendDataFromTBEToCache;
+    l_popForwardQueue;
+  }
+
+  transition({OI, MI}, {NC_DMA_GETS, Other_GETS, Other_GETS_No_Mig}, OI) {
+    sq_sendSharedDataFromTBEToCache;
+    l_popForwardQueue;
+  }
+
+  transition({OI, MI}, Merged_GETS, OI) {
+    qm_sendDataFromTBEToCache;
+    l_popForwardQueue;
+  }
+
+  transition(MI, Writeback_Ack, I) {
+    t_sendExclusiveDataFromTBEToMemory;
+    s_deallocateTBE;
+    l_popForwardQueue;
+    kd_wakeUpDependents;
+  }
+
+  transition(MI_F, Writeback_Ack, I) {
+    hh_flush_hit;
+    t_sendExclusiveDataFromTBEToMemory;
+    s_deallocateTBE;
+    l_popForwardQueue;
+    kd_wakeUpDependents;
+  }
+
+  transition(OI, Writeback_Ack, I) {
+    qq_sendDataFromTBEToMemory;
+    s_deallocateTBE;
+    l_popForwardQueue;
+    kd_wakeUpDependents;
+  }
+
+  // Transitions from II
+  transition(II, {NC_DMA_GETS, Other_GETS, Other_GETS_No_Mig, Other_GETX, Invalidate}, II) {
+    f_sendAck;
+    l_popForwardQueue;
+  }
+
+  transition(II, Writeback_Ack, I) {
+    g_sendUnblock;
+    s_deallocateTBE;
+    l_popForwardQueue;
+    kd_wakeUpDependents;
+  }
+
+  transition(II, Writeback_Nack, I) {
+    s_deallocateTBE;
+    l_popForwardQueue;
+    kd_wakeUpDependents;
+  }
+
+  transition(MM_F, {Other_GETX, Invalidate}, IM_F) {
+    ct_sendExclusiveDataFromTBE;
+    pp_incrementNumberOfMessagesByOne;
+    l_popForwardQueue;
+  }
+
+  transition(MM_F, Other_GETS, IM_F) {
+    ct_sendExclusiveDataFromTBE;
+    pp_incrementNumberOfMessagesByOne;
+    l_popForwardQueue;
+  }
+
+  transition(MM_F, NC_DMA_GETS, OM_F) {
+    sq_sendSharedDataFromTBEToCache;
+    l_popForwardQueue;
+  }
+
+  transition(MM_F, Other_GETS_No_Mig, OM_F) {
+    et_sendDataSharedFromTBE;
+    l_popForwardQueue;
+  }
+
+  transition(MM_F, Merged_GETS, OM_F) {
+    emt_sendDataSharedMultipleFromTBE;
+    l_popForwardQueue;
+  }
+}
diff -r 3ee9d80f490f -r 7b001aa001f0 src/mem/protocol/MOESI_hammer_bcu-BCU.sm
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/mem/protocol/MOESI_hammer_bcu-BCU.sm	Wed Dec 02 17:08:48 2015 -0600
@@ -0,0 +1,252 @@
+
+
+
+machine(BorderControlUnit, "Border control unit")
+: Cycles latency := 1;
+
+  // NOTE: I'm pretty sure the directory to cache side is not necessary.
+  // But we'll just never send messages to this controller over these
+  // networks.
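+  //
+  // The BCU sits on the CPU/GPU boundary: GPU-side L1s broadcast requests,
+  // responses, and unblocks to it (see the is_gpu branches in the cache
+  // machine above), and it relays each message to the directory named in
+  // OriginalDestination. It keeps no per-block state, so everything is
+  // handled in the single stable state I.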
+ + // Interface with the cache (copied from MOESI_hammer-dir.sm) + // MessageBuffer * forwardFromDirToCache, network="To", virtual_network="3", ordered="false", vnet_type="forward"; + // MessageBuffer * responseFromDirToCache, network="To", virtual_network="4", ordered="false", vnet_type="response"; + + MessageBuffer * unblockToDirFromCache, network="From", virtual_network="5", ordered="false", vnet_type="unblock"; + MessageBuffer * responseToDirFromCache, network="From", virtual_network="4", ordered="false", vnet_type="response"; + MessageBuffer * requestToDirFromCache, network="From", virtual_network="2", ordered="false", vnet_type="request", recycle_latency="1"; + + // Interface with the directory (copied from MOESI_hammer-cache.sm) + MessageBuffer * requestFromCacheToDir, network="To", virtual_network="2", ordered="false", vnet_type="request"; + MessageBuffer * responseFromCacheToDir, network="To", virtual_network="4", ordered="false", vnet_type="response"; + MessageBuffer * unblockFromCacheToDir, network="To", virtual_network="5", ordered="false", vnet_type="unblock"; + + // MessageBuffer * forwardToCacheFromDir, network="From", virtual_network="3", ordered="false", vnet_type="forward"; + // MessageBuffer * responseToCacheFromDir, network="From", virtual_network="4", ordered="false", vnet_type="response"; +{ + + state_declaration(State, desc="Cache states") { + I, AccessPermission:Invalid, desc="Idle"; + } + + enumeration(Event, desc="BCU events") { + //FwdFromDir; + RespFromCache; + UnblockFromCache; + //ResponseFromDir; + RequestFromCache; + } + + DataBlock blk; + + DataBlock getDataBlock(Address addr), return_by_ref="yes" { + return blk; + } + + AccessPermission getAccessPermission(Address addr) { + return AccessPermission:NotPresent; + } + + void setAccessPermission(Address addr, State state) { + } + + State getState(Address addr) { + return State:I; + } + + void setState(Address addr, State state) { + } + + //out_port(fwdNetwork_out, RequestMsg, forwardFromDirToCache); + //out_port(responseNetworkToCache_out, ResponseMsg, responseFromDirToCache); + out_port(requestNetwork_out, RequestMsg, requestFromCacheToDir); + out_port(responseNetworkToDir_out, ResponseMsg, responseFromCacheToDir); + out_port(unblockNetwork_out, ResponseMsg, unblockFromCacheToDir); + + // Think about if this is the right order!! + + in_port(unblock_in, ResponseMsg, unblockToDirFromCache) { + if (unblock_in.isReady()) { + peek(unblock_in, ResponseMsg) { + trigger(Event:UnblockFromCache, in_msg.Addr); + // NOTE: no need for the entry or the TBE unless we explicitly + // add those things to this controller. 
(see line 81 of + // InPortDeclAST.py) + } + } + } + + // in_port(respFromDir_in, ResponseMsg, responseToCacheFromDir) { + // if (respFromDir_in.isReady()) { + // peek(respFromDir_in, ResponseMsg) { + // trigger(Event:ResponseFromDir, in_msg.Addr); + // } + // } + // } + + in_port(respFromCache_in, ResponseMsg, responseToDirFromCache) { + if (respFromCache_in.isReady()) { + peek(respFromCache_in, ResponseMsg) { + trigger(Event:RespFromCache, in_msg.Addr); + } + } + } + + in_port(requestFromCache_in, RequestMsg, requestToDirFromCache) { + if (requestFromCache_in.isReady()) { + peek(requestFromCache_in, RequestMsg) { + trigger(Event:RequestFromCache, in_msg.Addr); + } + } + } + + // in_port(fwdFromDir_in, ResponseMsg, forwardToCacheFromDir) { + // if (fwdFromDir_in.isReady()) { + // peek(fwdFromDir_in, ResponseMsg) { + // trigger(Event:FwdFromDir, in_msg.Addr); + // } + // } + // } + + // action(ff_fwdtocache, "ff", desc="FwdFromDir") { + // peek(fwdFromDir_in, ResponseMsg) { + // enqueue(fwdNetwork_out, ResponseMsg, latency) { + // out_msg.Addr := in_msg.Addr; + // out_msg.Type := in_msg.Type; + // out_msg.Sender := in_msg.Sender; + // out_msg.CurOwner := in_msg.CurOwner; + // out_msg.Destination := in_msg.OriginalDestination; + // out_msg.DataBlk := in_msg.DataBlk; + // out_msg.Dirty := in_msg.Dirty; + // out_msg.Acks := in_msg.Acks; + // out_msg.MessageSize := in_msg.MessageSize; + // out_msg.InitialRequestTime := in_msg.InitialRequestTime; + // out_msg.ForwardRequestTime := in_msg.ForwardRequestTime; + // out_msg.SilentAcks := in_msg.SilentAcks; + // } + // } + // } + + action(rsd_resptodir, "rtd", desc="RespFromCache") { + peek(respFromCache_in, ResponseMsg) { + DPRINTF(RubySlicc, "Got req to addr %s\n", in_msg.Addr); + enqueue(responseNetworkToDir_out, ResponseMsg, latency) { + out_msg.Addr := in_msg.Addr; + out_msg.Type := in_msg.Type; + out_msg.Sender := in_msg.Sender; + out_msg.CurOwner := in_msg.CurOwner; + out_msg.Destination := in_msg.OriginalDestination; + out_msg.DataBlk := in_msg.DataBlk; + out_msg.Dirty := in_msg.Dirty; + out_msg.Acks := in_msg.Acks; + out_msg.MessageSize := in_msg.MessageSize; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := in_msg.ForwardRequestTime; + out_msg.SilentAcks := in_msg.SilentAcks; + } + } + } + + // action(rsc_resptocache, "rsc", desc="ResponseFromDir") { + // peek(respFromDir_in, ResponseMsg) { + // enqueue(responseNetworkToCache_out, ResponseMsg, latency) { + // out_msg.Addr := in_msg.Addr; + // out_msg.Type := in_msg.Type; + // out_msg.Sender := in_msg.Sender; + // out_msg.CurOwner := in_msg.CurOwner; + // out_msg.Destination := in_msg.OriginalDestination; + // out_msg.DataBlk := in_msg.DataBlk; + // out_msg.Dirty := in_msg.Dirty; + // out_msg.Acks := in_msg.Acks; + // out_msg.MessageSize := in_msg.MessageSize; + // out_msg.InitialRequestTime := in_msg.InitialRequestTime; + // out_msg.ForwardRequestTime := in_msg.ForwardRequestTime; + // out_msg.SilentAcks := in_msg.SilentAcks; + // } + // } + // } + + action(uc_unblocktodir, "uc", desc="UnblockFromCache") { + peek(unblock_in, ResponseMsg) { + DPRINTF(RubySlicc, "Got req to addr %s\n", in_msg.Addr); + enqueue(unblockNetwork_out, ResponseMsg, latency) { + out_msg.Addr := in_msg.Addr; + out_msg.Type := in_msg.Type; + out_msg.Sender := in_msg.Sender; + out_msg.CurOwner := in_msg.CurOwner; + out_msg.Destination := in_msg.OriginalDestination; + out_msg.DataBlk := in_msg.DataBlk; + out_msg.Dirty := in_msg.Dirty; + out_msg.Acks := in_msg.Acks; + 
out_msg.MessageSize := in_msg.MessageSize; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := in_msg.ForwardRequestTime; + out_msg.SilentAcks := in_msg.SilentAcks; + } + } + } + + action(rqc_reqtodir, "rqc", desc="RequestFromCache") { + peek(requestFromCache_in, RequestMsg) { + DPRINTF(RubySlicc, "Got req to addr %s\n", in_msg.Addr); + enqueue(requestNetwork_out, RequestMsg, latency) { + out_msg.Addr := in_msg.Addr; + out_msg.Type := in_msg.Type; + out_msg.Requestor := in_msg.Requestor; + out_msg.MergedRequestors := in_msg.MergedRequestors; + out_msg.Destination := in_msg.OriginalDestination; + out_msg.MessageSize := in_msg.MessageSize; + out_msg.DirectedProbe := in_msg.DirectedProbe; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := in_msg.ForwardRequestTime; + out_msg.SilentAcks := in_msg.SilentAcks; + } + } + } + + action(pu_popunblock, "pu", desc="") { + unblock_in.dequeue(); + } + + // action(prfd_poprespFromDir, "prfd", desc="") { + // respFromDir_in.dequeue(); + // } + + action(prfc_poprespFromCache, "prfc", desc="") { + respFromCache_in.dequeue(); + } + + action(pq_popreqFromCache, "pq", desc="") { + requestFromCache_in.dequeue(); + } + + // action(pf_popfwdFromDir, "pf", desc="") { + // fwdFromDir_in.dequeue(); + // } + + + // transition({I}, {FwdFromDir}) { + // ff_fwdtocache; + // pf_popfwdFromDir; + // } + + transition({I}, {RespFromCache}) { + rsd_resptodir; + prfc_poprespFromCache; + } + + transition({I}, {UnblockFromCache}) { + uc_unblocktodir; + pu_popunblock; + } + + // transition({I}, {ResponseFromDir}) { + // rsc_resptocache; + // prfd_poprespFromDir; + // } + + transition({I}, {RequestFromCache}) { + rqc_reqtodir; + pq_popreqFromCache; + } +} \ No newline at end of file diff -r 3ee9d80f490f -r 7b001aa001f0 src/mem/protocol/MOESI_hammer_bcu-msg.sm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/mem/protocol/MOESI_hammer_bcu-msg.sm Wed Dec 02 17:08:48 2015 -0600 @@ -0,0 +1,203 @@ +/* + * Copyright (c) 1999-2008 Mark D. Hill and David A. Wood + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * AMD's contributions to the MOESI hammer protocol do not constitute an + * endorsement of its similarity to any AMD products. + */ + +// CoherenceRequestType +enumeration(CoherenceRequestType, desc="...") { + GETX, desc="Get eXclusive"; + GETS, desc="Get Shared"; + MERGED_GETS, desc="Get Shared"; + PUT, desc="Put Ownership"; + WB_ACK, desc="Writeback ack"; + WB_NACK, desc="Writeback neg. ack"; + PUTF, desc="PUT on a Flush"; + GETF, desc="Issue exclusive for Flushing"; + BLOCK_ACK, desc="Dir Block ack"; + INV, desc="Invalidate"; +} + +// CoherenceResponseType +enumeration(CoherenceResponseType, desc="...") { + ACK, desc="ACKnowledgment, responder does not have a copy"; + ACK_SHARED, desc="ACKnowledgment, responder has a shared copy"; + DATA, desc="Data, responder does not have a copy"; + DATA_SHARED, desc="Data, responder has a shared copy"; + DATA_EXCLUSIVE, desc="Data, responder was exclusive, gave us a copy, and they went to invalid"; + WB_CLEAN, desc="Clean writeback"; + WB_DIRTY, desc="Dirty writeback"; + WB_EXCLUSIVE_CLEAN, desc="Clean writeback of exclusive data"; + WB_EXCLUSIVE_DIRTY, desc="Dirty writeback of exclusive data"; + UNBLOCK, desc="Unblock for writeback"; + UNBLOCKS, desc="Unblock now in S"; + UNBLOCKM, desc="Unblock now in M/O/E"; + NULL, desc="Null value"; +} + +// TriggerType +enumeration(TriggerType, desc="...") { + L2_to_L1, desc="L2 to L1 transfer"; + ALL_ACKS, desc="See corresponding event"; + ALL_ACKS_OWNER_EXISTS,desc="See corresponding event"; + ALL_ACKS_NO_SHARERS, desc="See corresponding event"; + ALL_UNBLOCKS, desc="all unblockS received"; +} + +// TriggerMsg +structure(TriggerMsg, desc="...", interface="Message") { + Address Addr, desc="Physical address for this request"; + TriggerType Type, desc="Type of trigger"; + + bool functionalRead(Packet *pkt) { + // Trigger messages do not hold any data! + return false; + } + + bool functionalWrite(Packet *pkt) { + // Trigger messages do not hold any data! 
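+ // (Triggers such as ALL_ACKS are controller-internal bookkeeping
+ // events, so functional accesses can always skip them.)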
+ return false; + } +} + +// RequestMsg (and also forwarded requests) +structure(RequestMsg, desc="...", interface="NetworkMessage") { + Address Addr, desc="Physical address for this request"; + CoherenceRequestType Type, desc="Type of request (GetS, GetX, PutX, etc)"; + MachineID Requestor, desc="Node who initiated the request"; + NetDest MergedRequestors, desc="Merge set of read requestors"; + NetDest Destination, desc="Multicast destination mask"; + NetDest OriginalDestination, desc="Multicast destination mask"; + MessageSizeType MessageSize, desc="size category of the message"; + bool DirectedProbe, default="false", desc="probe filter directed probe"; + + Cycles InitialRequestTime, default="Cycles(0)", + desc="time the initial requests was sent from the L1Cache"; + Cycles ForwardRequestTime, default="Cycles(0)", + desc="time the dir forwarded the request"; + int SilentAcks, default="0", desc="silent acks from the full-bit directory"; + + bool functionalRead(Packet *pkt) { + // Request messages do not hold any data + return false; + } + + bool functionalWrite(Packet *pkt) { + // Request messages do not hold any data + return false; + } +} + +// ResponseMsg (and also unblock requests) +structure(ResponseMsg, desc="...", interface="NetworkMessage") { + Address Addr, desc="Physical address for this request"; + CoherenceResponseType Type, desc="Type of response (Ack, Data, etc)"; + MachineID Sender, desc="Node who sent the data"; + MachineID CurOwner, desc="current owner of the block, used for UnblockS responses"; + NetDest Destination, desc="Node to whom the data is sent"; + NetDest OriginalDestination, desc="Multicast destination mask"; + DataBlock DataBlk, desc="data for the cache line"; + bool Dirty, desc="Is the data dirty (different than memory)?"; + int Acks, default="0", desc="How many messages this counts as"; + MessageSizeType MessageSize, desc="size category of the message"; + + Cycles InitialRequestTime, default="Cycles(0)", + desc="time the initial requests was sent from the L1Cache"; + Cycles ForwardRequestTime, default="Cycles(0)", + desc="time the dir forwarded the request"; + int SilentAcks, default="0", desc="silent acks from the full-bit directory"; + + bool functionalRead(Packet *pkt) { + // The check below ensures that data is read only from messages that + // actually hold data. + if (Type == CoherenceResponseType:DATA || + Type == CoherenceResponseType:DATA_SHARED || + Type == CoherenceResponseType:DATA_EXCLUSIVE || + Type == CoherenceResponseType:WB_DIRTY || + Type == CoherenceResponseType:WB_EXCLUSIVE_DIRTY) { + return testAndRead(Addr, DataBlk, pkt); + } + + return false; + } + + bool functionalWrite(Packet *pkt) { + // Message type does not matter since all messages are written. + // If a protocol reads data from a packet that is not supposed + // to hold the data, then the fault lies with the protocol. 
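+ // (Unconditionally patching DataBlk is safe because a stale block in a
+ // non-data message is never consumed; functionalRead above filters by
+ // message type.)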
+ return testAndWrite(Addr, DataBlk, pkt); + } +} + +enumeration(DMARequestType, desc="...", default="DMARequestType_NULL") { + READ, desc="Memory Read"; + WRITE, desc="Memory Write"; + NULL, desc="Invalid"; +} + +enumeration(DMAResponseType, desc="...", default="DMAResponseType_NULL") { + DATA, desc="DATA read"; + ACK, desc="ACK write"; + NULL, desc="Invalid"; +} + +structure(DMARequestMsg, desc="...", interface="NetworkMessage") { + DMARequestType Type, desc="Request type (read/write)"; + Address PhysicalAddress, desc="Physical address for this request"; + Address LineAddress, desc="Line address for this request"; + MachineID Requestor, desc="Node who initiated the request"; + NetDest Destination, desc="Destination"; + DataBlock DataBlk, desc="DataBlk attached to this request"; + int Len, desc="The length of the request"; + MessageSizeType MessageSize, desc="size category of the message"; + + bool functionalRead(Packet *pkt) { + return testAndRead(LineAddress, DataBlk, pkt); + } + + bool functionalWrite(Packet *pkt) { + return testAndWrite(LineAddress, DataBlk, pkt); + } +} + +structure(DMAResponseMsg, desc="...", interface="NetworkMessage") { + DMAResponseType Type, desc="Response type (DATA/ACK)"; + Address PhysicalAddress, desc="Physical address for this request"; + Address LineAddress, desc="Line address for this request"; + NetDest Destination, desc="Destination"; + DataBlock DataBlk, desc="DataBlk attached to this request"; + MessageSizeType MessageSize, desc="size category of the message"; + + bool functionalRead(Packet *pkt) { + return testAndRead(LineAddress, DataBlk, pkt); + } + + bool functionalWrite(Packet *pkt) { + return testAndWrite(LineAddress, DataBlk, pkt); + } +} diff -r 3ee9d80f490f -r 7b001aa001f0 src/mem/protocol/MOESI_hammer_bcu.slicc --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/mem/protocol/MOESI_hammer_bcu.slicc Wed Dec 02 17:08:48 2015 -0600 @@ -0,0 +1,7 @@ +protocol "MOESI_hammer_bcu"; +include "RubySlicc_interfaces.slicc"; +include "MOESI_hammer_bcu-msg.sm"; +include "MOESI_hammer_bcu-BCU.sm"; +include "MOESI_hammer-GPUcache.sm"; +include "MOESI_hammer-dir.sm"; +include "MOESI_hammer-dma.sm"; \ No newline at end of file diff -r 3ee9d80f490f -r 7b001aa001f0 src/mem/protocol/SConsopts --- a/src/mem/protocol/SConsopts Wed Dec 02 17:08:48 2015 -0600 +++ b/src/mem/protocol/SConsopts Wed Dec 02 17:08:48 2015 -0600 @@ -34,6 +34,7 @@ all_protocols.extend([ 'VI_hammer', + 'MOESI_hammer_bcu', ]) protocol_dirs.append(str(Dir('.').abspath)) # HG changeset patch # User Lena Olson # Date 1449097728 21600 # Node ID 1b6fae7cb423ecdab1ce5ec545f97637e438f884 # Parent 7b001aa001f007c6af4ab6ddfcf2f3b491b108d9 Adds a simple permission table and uses it in the BCU diff -r 7b001aa001f0 -r 1b6fae7cb423 configs/GPUConfig.py --- a/configs/GPUConfig.py Wed Dec 02 17:08:48 2015 -0600 +++ b/configs/GPUConfig.py Wed Dec 02 17:08:48 2015 -0600 @@ -63,6 +63,8 @@ parser.add_option("--gpu_tlb_entries", type="int", default=0, help="Number of entries in GPU TLB. 0 implies infinite") parser.add_option("--gpu_tlb_assoc", type="int", default=0, help="Associativity of the L1 TLB. 
0 implies infinite") parser.add_option("--pwc_size", default="8kB", help="Capacity of the page walk cache") + parser.add_option("--plb_size", default=64, help="Entries in the PLB (Border Control Buffer/Cache)") + parser.add_option("--plb_alignment", default=0, help="log of addresses per entry in PLB") def configureMemorySpaces(options): total_mem_range = AddrRange(options.total_mem_size)
diff -r 7b001aa001f0 -r 1b6fae7cb423 configs/fs_fusion.py --- a/configs/fs_fusion.py Wed Dec 02 17:08:48 2015 -0600 +++ b/configs/fs_fusion.py Wed Dec 02 17:08:48 2015 -0600 @@ -151,8 +151,17 @@ voltage_domain = system.voltage_domain) Ruby.create_system(options, system, system.iobus, system._dma_ports) + system.gpu.ruby = system.ruby system.ruby.clk_domain = system.ruby_clk_domain +if hasattr(system.ruby, "bcu_cntrl"): + system.gpu.shader_mmu.permission_table = system.ruby.bcu_cntrl.perm_table +else: + system.gpu.shader_mmu.permission_table = PermissionTable() + +system.gpu.shader_mmu.permission_table.mem_size = options.total_mem_size +system.gpu.shader_mmu.permission_table.plb_size = options.plb_size +system.gpu.shader_mmu.permission_table.plb_alignment = options.plb_alignment # # Connect CPU ports
diff -r 7b001aa001f0 -r 1b6fae7cb423 configs/gpu_protocol/MOESI_hammer_bcu_fusion.py --- a/configs/gpu_protocol/MOESI_hammer_bcu_fusion.py Wed Dec 02 17:08:48 2015 -0600 +++ b/configs/gpu_protocol/MOESI_hammer_bcu_fusion.py Wed Dec 02 17:08:48 2015 -0600 @@ -242,6 +242,7 @@ # BCU cntrl = BorderControlUnit_Controller(version = 0, + perm_table = PermissionTable(), ruby_system = ruby_system) ruby_system.bcu_cntrl = cntrl topology.addController(cntrl)
diff -r 7b001aa001f0 -r 1b6fae7cb423 src/gpu/ShaderMMU.py --- a/src/gpu/ShaderMMU.py Wed Dec 02 17:08:48 2015 -0600 +++ b/src/gpu/ShaderMMU.py Wed Dec 02 17:08:48 2015 -0600 @@ -32,6 +32,7 @@ from m5.proxy import * from m5.util import fatal from ClockedObject import ClockedObject +from PermissionTable import PermissionTable class ShaderMMU(ClockedObject): type = 'ShaderMMU' @@ -52,6 +53,8 @@ l2_tlb_entries = Param.Int(0, "Number of entries in the L2 TLB (0=>no L2)") l2_tlb_assoc = Param.Int(4, "Associativity of the L2 TLB (0 => full)") + permission_table = Param.PermissionTable(Parent.any, "Permission Table") + prefetch_buffer_size = Param.Int(0, "Size of the prefetch buffer") def setUpPagewalkers(self, num, port, bypass_l1):
diff -r 7b001aa001f0 -r 1b6fae7cb423 src/gpu/shader_mmu.cc --- a/src/gpu/shader_mmu.cc Wed Dec 02 17:08:48 2015 -0600 +++ b/src/gpu/shader_mmu.cc Wed Dec 02 17:08:48 2015 -0600 @@ -51,7 +51,8 @@ ShaderMMU::ShaderMMU(const Params *p) : ClockedObject(p), pagewalkers(p->pagewalkers), latency(p->latency), outstandingFaultStatus(None), curOutstandingWalks(0), - prefetchBufferSize(p->prefetch_buffer_size) + prefetchBufferSize(p->prefetch_buffer_size), + permissionTable(p->permission_table) { activeWalkers.resize(pagewalkers.size()); if (p->l2_tlb_entries > 0) { @@ -91,6 +92,10 @@ l2hits++; req->setPaddr(ppn + offset); req_tlb->insert(vpn, ppn); + if (permissionTable){ + permissionTable->insert(ppn, (mode == BaseTLB::Write)); + //TODO: add in mem write request if returns true + } translation->finish(NoFault, req, tc, mode); return; } @@ -104,6 +109,10 @@ if (tlb) { tlb->insert(vpn, ppn); } + if (permissionTable){ + permissionTable->insert(ppn, (mode == BaseTLB::Write)); + //TODO: add in mem write request if returns true + } req->setPaddr(ppn + offset); req_tlb->insert(vpn, ppn); translation->finish(NoFault, req, tc, mode); @@ -209,6 +218,13 @@
list<TranslationRequest*>::iterator it; list<TranslationRequest*> &walks = outstandingWalks[vpn]; DPRINTF(ShaderMMU, "Walk satisfies %d outstanding reqs\n", walks.size()); + if (permissionTable && !translation->prefetch){ + TheISA::TlbEntry *entry; + assert(translation->pageWalker); + entry = translation->pageWalker->lookup(vpn, false); + permissionTable->insert(ppn, entry->writable); + //TODO add in memory write request on miss? + } for (it = walks.begin(); it != walks.end(); it++) { TranslationRequest *t = (*it);
diff -r 7b001aa001f0 -r 1b6fae7cb423 src/gpu/shader_mmu.hh --- a/src/gpu/shader_mmu.hh Wed Dec 02 17:08:48 2015 -0600 +++ b/src/gpu/shader_mmu.hh Wed Dec 02 17:08:48 2015 -0600 @@ -44,6 +44,7 @@ #include "sim/clocked_object.hh" #include "sim/faults.hh" #include "sim/tlb.hh" +#include "mem/ruby/PermissionTable.hh" class ShaderMMU : public ClockedObject { @@ -151,6 +152,8 @@ // Insert prefetch into prefetch buffer void insertPrefetch(Addr vpn, Addr ppn); + PermissionTable * permissionTable; + public: /// Constructor typedef ShaderMMUParams Params;
diff -r 7b001aa001f0 -r 1b6fae7cb423 src/mem/protocol/MOESI_hammer_bcu-BCU.sm --- a/src/mem/protocol/MOESI_hammer_bcu-BCU.sm Wed Dec 02 17:08:48 2015 -0600 +++ b/src/mem/protocol/MOESI_hammer_bcu-BCU.sm Wed Dec 02 17:08:48 2015 -0600 @@ -1,28 +1,64 @@ +/* + * Copyright (c) 2014 Mark D. Hill and David A. Wood + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * AMD's contributions to the MOESI hammer protocol do not constitute an + * endorsement of its similarity to any AMD products. + * + * Authors: Lena Olson + * Jason Power + */ - +structure (PermissionTable, external="yes") { + void logAddress(Address, bool); + bool checkPLB(Address, bool); + bool checkTable(Address, bool); +} machine(BorderControlUnit, "Border control unit") -: Cycles latency := 1; - - // NOTE: I'm pretty sure the directory to cache side is not necessary. - // But we'll just never send messages to this controller over these - // networks.
+: PermissionTable * perm_table; + Cycles hit_latency := 10; + Cycles miss_latency := 100; // Interface with the cache (copied from MOESI_hammer-dir.sm) - // MessageBuffer * forwardFromDirToCache, network="To", virtual_network="3", ordered="false", vnet_type="forward"; - // MessageBuffer * responseFromDirToCache, network="To", virtual_network="4", ordered="false", vnet_type="response"; - - MessageBuffer * unblockToDirFromCache, network="From", virtual_network="5", ordered="false", vnet_type="unblock"; - MessageBuffer * responseToDirFromCache, network="From", virtual_network="4", ordered="false", vnet_type="response"; - MessageBuffer * requestToDirFromCache, network="From", virtual_network="2", ordered="false", vnet_type="request", recycle_latency="1"; + MessageBuffer * unblockToDirFromCache, network="From", virtual_network="5", + ordered="false", vnet_type="unblock"; + MessageBuffer * responseToDirFromCache, network="From", virtual_network="4", + ordered="false", vnet_type="response"; + MessageBuffer * requestToDirFromCache, network="From", virtual_network="2", + ordered="false", vnet_type="request", recycle_latency="1"; // Interface with the directory (copied from MOESI_hammer-cache.sm) - MessageBuffer * requestFromCacheToDir, network="To", virtual_network="2", ordered="false", vnet_type="request"; - MessageBuffer * responseFromCacheToDir, network="To", virtual_network="4", ordered="false", vnet_type="response"; - MessageBuffer * unblockFromCacheToDir, network="To", virtual_network="5", ordered="false", vnet_type="unblock"; + MessageBuffer * requestFromCacheToDir, network="To", virtual_network="2", + ordered="false", vnet_type="request"; + MessageBuffer * responseFromCacheToDir, network="To", virtual_network="4", + ordered="false", vnet_type="response"; + MessageBuffer * unblockFromCacheToDir, network="To", virtual_network="5", + ordered="false", vnet_type="unblock"; - // MessageBuffer * forwardToCacheFromDir, network="From", virtual_network="3", ordered="false", vnet_type="forward"; - // MessageBuffer * responseToCacheFromDir, network="From", virtual_network="4", ordered="false", vnet_type="response"; { state_declaration(State, desc="Cache states") { @@ -30,10 +66,8 @@ } enumeration(Event, desc="BCU events") { - //FwdFromDir; RespFromCache; UnblockFromCache; - //ResponseFromDir; RequestFromCache; } @@ -57,8 +91,6 @@ void setState(Address addr, State state) { } - //out_port(fwdNetwork_out, RequestMsg, forwardFromDirToCache); - //out_port(responseNetworkToCache_out, ResponseMsg, responseFromDirToCache); out_port(requestNetwork_out, RequestMsg, requestFromCacheToDir); out_port(responseNetworkToDir_out, ResponseMsg, responseFromCacheToDir); out_port(unblockNetwork_out, ResponseMsg, unblockFromCacheToDir); @@ -76,14 +108,6 @@ } } - // in_port(respFromDir_in, ResponseMsg, responseToCacheFromDir) { - // if (respFromDir_in.isReady()) { - // peek(respFromDir_in, ResponseMsg) { - // trigger(Event:ResponseFromDir, in_msg.Addr); - // } - // } - // } - in_port(respFromCache_in, ResponseMsg, responseToDirFromCache) { if (respFromCache_in.isReady()) { peek(respFromCache_in, ResponseMsg) { @@ -99,37 +123,34 @@ } } } - - // in_port(fwdFromDir_in, ResponseMsg, forwardToCacheFromDir) { - // if (fwdFromDir_in.isReady()) { - // peek(fwdFromDir_in, ResponseMsg) { - // trigger(Event:FwdFromDir, in_msg.Addr); - // } - // } - // } - - // action(ff_fwdtocache, "ff", desc="FwdFromDir") { - // peek(fwdFromDir_in, ResponseMsg) { - // enqueue(fwdNetwork_out, ResponseMsg, latency) { - // out_msg.Addr := in_msg.Addr; - 
// out_msg.Type := in_msg.Type; - // out_msg.Sender := in_msg.Sender; - // out_msg.CurOwner := in_msg.CurOwner; - // out_msg.Destination := in_msg.OriginalDestination; - // out_msg.DataBlk := in_msg.DataBlk; - // out_msg.Dirty := in_msg.Dirty; - // out_msg.Acks := in_msg.Acks; - // out_msg.MessageSize := in_msg.MessageSize; - // out_msg.InitialRequestTime := in_msg.InitialRequestTime; - // out_msg.ForwardRequestTime := in_msg.ForwardRequestTime; - // out_msg.SilentAcks := in_msg.SilentAcks; - // } - // } - // } action(rsd_resptodir, "rtd", desc="RespFromCache") { peek(respFromCache_in, ResponseMsg) { DPRINTF(RubySlicc, "Got req to addr %s\n", in_msg.Addr); + Cycles latency := hit_latency; + /*Do we need read permission*/ + if (in_msg.Type == CoherenceResponseType:ACK_SHARED){ + perm_table.logAddress(in_msg.Addr, false); + bool ret := perm_table.checkPLB(in_msg.Addr, false); + if (ret != true) { + perm_table.checkTable(in_msg.Addr, false); + latency := miss_latency; + } + } + /*Do we need write permission*/ + else if (in_msg.Type == CoherenceResponseType:DATA || + in_msg.Type == CoherenceResponseType:DATA_EXCLUSIVE || + in_msg.Type == CoherenceResponseType:DATA_SHARED){ + /*With this coherence protocol, hard to tell if this was an + innocent read that got O or what. Use dirty bit.*/ + perm_table.logAddress(in_msg.Addr, true); + bool ret := perm_table.checkPLB(in_msg.Addr, in_msg.Dirty); + if (ret != true) { + perm_table.checkTable(in_msg.Addr, true); + latency := miss_latency; + } + } + enqueue(responseNetworkToDir_out, ResponseMsg, latency) { out_msg.Addr := in_msg.Addr; out_msg.Type := in_msg.Type; @@ -147,29 +168,34 @@ } } - // action(rsc_resptocache, "rsc", desc="ResponseFromDir") { - // peek(respFromDir_in, ResponseMsg) { - // enqueue(responseNetworkToCache_out, ResponseMsg, latency) { - // out_msg.Addr := in_msg.Addr; - // out_msg.Type := in_msg.Type; - // out_msg.Sender := in_msg.Sender; - // out_msg.CurOwner := in_msg.CurOwner; - // out_msg.Destination := in_msg.OriginalDestination; - // out_msg.DataBlk := in_msg.DataBlk; - // out_msg.Dirty := in_msg.Dirty; - // out_msg.Acks := in_msg.Acks; - // out_msg.MessageSize := in_msg.MessageSize; - // out_msg.InitialRequestTime := in_msg.InitialRequestTime; - // out_msg.ForwardRequestTime := in_msg.ForwardRequestTime; - // out_msg.SilentAcks := in_msg.SilentAcks; - // } - // } - // } - action(uc_unblocktodir, "uc", desc="UnblockFromCache") { peek(unblock_in, ResponseMsg) { DPRINTF(RubySlicc, "Got req to addr %s\n", in_msg.Addr); - enqueue(unblockNetwork_out, ResponseMsg, latency) { + Cycles latency := hit_latency; + /*Do we need read permission*/ + if (in_msg.Type == CoherenceResponseType:UNBLOCK || + in_msg.Type == CoherenceResponseType:UNBLOCKS || + in_msg.Type == CoherenceResponseType:WB_CLEAN || + in_msg.Type == CoherenceResponseType:UNBLOCKM || + in_msg.Type == CoherenceResponseType:WB_EXCLUSIVE_CLEAN){ + perm_table.logAddress(in_msg.Addr, false); + bool ret := perm_table.checkPLB(in_msg.Addr, false); + if (ret != true) { + perm_table.checkTable(in_msg.Addr, false); + latency := miss_latency; + } + } + /*Do we need write permission*/ + else if (in_msg.Type == CoherenceResponseType:WB_DIRTY || + in_msg.Type == CoherenceResponseType:WB_EXCLUSIVE_DIRTY){ + perm_table.logAddress(in_msg.Addr, true); + bool ret := perm_table.checkPLB(in_msg.Addr, true); + if (ret != true) { + perm_table.checkTable(in_msg.Addr, true); + latency := miss_latency; + } + } + enqueue(unblockNetwork_out, ResponseMsg, latency) { out_msg.Addr := in_msg.Addr; 
out_msg.Type := in_msg.Type; out_msg.Sender := in_msg.Sender; @@ -189,6 +215,29 @@ action(rqc_reqtodir, "rqc", desc="RequestFromCache") { peek(requestFromCache_in, RequestMsg) { DPRINTF(RubySlicc, "Got req to addr %s\n", in_msg.Addr); + Cycles latency := hit_latency; + /*Do we need read permission*/ + if (in_msg.Type == CoherenceRequestType:GETX || + in_msg.Type == CoherenceRequestType:GETS || + in_msg.Type == CoherenceRequestType:MERGED_GETS || + in_msg.Type == CoherenceRequestType:GETF){ + perm_table.logAddress(in_msg.Addr, false); + bool ret := perm_table.checkPLB(in_msg.Addr, false); + if (ret != true) { + perm_table.checkTable(in_msg.Addr, false); + latency := miss_latency; + } + } + /*Do we need write permission*/ + else if (in_msg.Type == CoherenceRequestType:PUT || + in_msg.Type == CoherenceRequestType:PUTF){ + perm_table.logAddress(in_msg.Addr, true); + bool ret := perm_table.checkPLB(in_msg.Addr, true); + if (ret != true) { + perm_table.checkTable(in_msg.Addr, true); + latency := miss_latency; + } + } enqueue(requestNetwork_out, RequestMsg, latency) { out_msg.Addr := in_msg.Addr; out_msg.Type := in_msg.Type; @@ -208,10 +257,6 @@ unblock_in.dequeue(); } - // action(prfd_poprespFromDir, "prfd", desc="") { - // respFromDir_in.dequeue(); - // } - action(prfc_poprespFromCache, "prfc", desc="") { respFromCache_in.dequeue(); } @@ -220,16 +265,6 @@ requestFromCache_in.dequeue(); } - // action(pf_popfwdFromDir, "pf", desc="") { - // fwdFromDir_in.dequeue(); - // } - - - // transition({I}, {FwdFromDir}) { - // ff_fwdtocache; - // pf_popfwdFromDir; - // } - transition({I}, {RespFromCache}) { rsd_resptodir; prfc_poprespFromCache; @@ -240,13 +275,8 @@ pu_popunblock; } - // transition({I}, {ResponseFromDir}) { - // rsc_resptocache; - // prfd_poprespFromDir; - // } - transition({I}, {RequestFromCache}) { rqc_reqtodir; pq_popreqFromCache; } -} \ No newline at end of file +} diff -r 7b001aa001f0 -r 1b6fae7cb423 src/mem/protocol/SConsopts --- a/src/mem/protocol/SConsopts Wed Dec 02 17:08:48 2015 -0600 +++ b/src/mem/protocol/SConsopts Wed Dec 02 17:08:48 2015 -0600 @@ -40,3 +40,4 @@ protocol_dirs.append(str(Dir('.').abspath)) slicc_includes.append('mem/ruby/RubySlicc_GPUMappings.hh') +slicc_includes.append('mem/ruby/PermissionTable.hh') diff -r 7b001aa001f0 -r 1b6fae7cb423 src/mem/ruby/PermissionTable.cc --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/mem/ruby/PermissionTable.cc Wed Dec 02 17:08:48 2015 -0600 @@ -0,0 +1,334 @@ +/* + * Copyright (c) 2014 Mark D. Hill and David A. Wood + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Authors: Lena Olson, Jason Power + * + */ + + +#include "debug/PermissionTable.hh" +#include "PermissionTable.hh" + +PermissionTable::PermissionTable(const Params *p) : + ClockedObject(p), mem_size(p->mem_size), plb_size(p->plb_size), + plb_alignment(p->plb_alignment) +{ + page_shift = 12; //because I'm too dumb to use TheISA + + // init based on params giving size of mem + + // bitmap has 2 bits per page (page is 4kB) + permission_bitmap = std::vector<bool>((mem_size >> page_shift) * 2, false); + + // PLB is currently a list, so we don't have to touch that + + plb_entry_size = 1 << plb_alignment; + printf("plb alignment is %lu, plb_entry_size is %lu\n", plb_alignment, plb_entry_size); +} + +void PermissionTable::logAddress(Address addr, bool isWrite) +{ + DPRINTF(PermissionTable, "Addr %#x, %s\n", addr.getAddress(), isWrite ? "W" : "R"); +} + +/* After ATS translates address, insert it into permission table & cache. + * Should only be called from ATS (shaderMMU) + * Returns true if anything was inserted, since then a memory request + * should be generated. */ +/* addr should be full-length address*/ +bool PermissionTable::insert(Addr addr, bool hasWrite) +{ + // first check that the address is valid + assert((addr & 0x7f) == 0); + assert (addr < mem_size); + Address plb_tag = getPLBTag(Address(addr)); + Addr ppn = addr >> page_shift; + uint32_t offset = (ppn % plb_entry_size) * 2; + + plb_entry entry = popPLBEntry(plb_tag); + if (entry.first.getAddress() == 0){ + //not in PLB - insert it + hasWrite ? m_plb_write_insert_miss++ : m_plb_read_insert_miss++; + if (plb_size > 0){ + //update with this entry + entry.first = plb_tag; + //initialize vector + entry.second = std::vector<bool>(plb_entry_size * 2, false); + entry.second[offset] = true; + entry.second[offset + 1] = hasWrite; + + //add it to PLB + plb.push_front(entry); + + while (plb.size() > plb_size){ //PLB is full + plb.pop_back(); + m_plb_evict++; + } + } + // finally, insert into table + writeTable(Address(addr), hasWrite); + + return true; //so slicc can generate mem req + } + + assert(plb_alignment || entry.second[offset]); + // is the block already in the plb? If so, push it to the front + // check permissions + if (!entry.second[offset] || (hasWrite && !entry.second[offset+1])){ + hasWrite ? m_plb_write_partial_hit++ : m_plb_read_partial_hit++; + //bad permissions - update entry + entry.second[offset] = true; + entry.second[offset+1] = hasWrite; + //update table + writeTable(Address(addr), hasWrite); + } + else { + hasWrite ?
m_plb_write_insert_hit++ : m_plb_read_insert_hit++; + } + + //Add entry back to front of list + plb.push_front(entry); + return false; +} + +/* Expects that page_addr is full-length address*/ +bool PermissionTable::checkPLB(Address addr, bool isWriteback) +{ + if (plb_size <= 0){ + return false; + } + Address plb_tag = getPLBTag(addr); + Addr ppn = addr.shiftLowOrderBits(page_shift); + uint32_t offset = (ppn % plb_entry_size) * 2; + + plb_entry entry = popPLBEntry(plb_tag); + if (entry.first.getAddress() == 0){ + isWriteback ? m_plb_write_miss++ : m_plb_read_miss++; + return false; + } + + if (isWriteback && !entry.second[offset+1]){ + printf("lena: Writeback for non-writable block? %llx\n", addr.getAddress()); + //In this case, we want to check the table + m_plb_write_mismatch++; + return false; + } + + isWriteback ? m_plb_write_hit++ : m_plb_read_hit++; + //update LRU + plb.push_front(entry); + return true; +} + +/*Helper functions for dealing with PLB*/ +/* requires pre-shifted address (tag) */ +PermissionTable::plb_entry PermissionTable::popPLBEntry(Address plb_tag){ + plb_entry ret; + ret.first = Address(0); + //get addr aligned + for (auto it = plb.begin(); it != plb.end(); it++){ + if (it->first == plb_tag){ + ret = *it; + plb.erase(it); + return ret; + } + } + return ret; +} + +Address PermissionTable::getPLBTag(Address addr){ + return Address(addr.shiftLowOrderBits(plb_alignment + page_shift)); +} + +/* Requires full-length address*/ +bool PermissionTable::checkTable(Address addr, bool isWriteback) +{ + Address plb_tag = getPLBTag(addr); + m_table_read++; + bool writeable = checkTableWrite(addr); + bool readable = checkTableRead(addr); + if (!readable || (isWriteback && !writeable)){ + printf("lena: Warning: checkTable failed for %llx %s\n", addr.getAddress(), isWriteback ? "W" : "R"); + } + + if (plb_size > 0) { + //update the PLB + //Get the corresponding entry from the table + plb_entry entry = getTableBlock(plb_tag); + + plb.push_front(entry); + + while (plb.size() > plb_size){ //PLB is full + plb.pop_back(); + m_plb_evict++; + } + } + + return isWriteback ? 
writeable : readable; +} + + +/* Helper functions for dealing with the table */ +void PermissionTable::writeTable(Address addr, bool hasWrite){ + m_table_write++; + Addr ppn = addr.shiftLowOrderBits(page_shift); + permission_bitmap[(ppn*2)] = true; + permission_bitmap[(ppn*2)+1] = permission_bitmap[(ppn*2)+1] | hasWrite; +} + +bool PermissionTable::checkTableRead(Address addr){ + Addr ppn = addr.shiftLowOrderBits(page_shift); + return permission_bitmap[(ppn * 2)]; +} + +bool PermissionTable::checkTableWrite(Address addr){ + Addr ppn = addr.shiftLowOrderBits(page_shift); + return permission_bitmap[(ppn * 2) + 1]; +} + +Address PermissionTable::getTableAddress(Address addr){ + assert(false); // can't remember why I wrote this function + //get byte address + Address a(addr.shiftLowOrderBits(page_shift) / 4); + //make into block address + a.makeLineAddress(); + return a; +} + +PermissionTable::plb_entry PermissionTable::getTableBlock(Address plb_tag){ + //printf("Get Table Block %llx\n", plb_tag.getAddress()); + //plb_tag is the first bits of the ppn; need to refill 0s + Addr ppn = plb_tag.getAddress() << plb_alignment; + plb_entry ret; + ret.first = plb_tag; + +#if 0 + printf("ppn*2 is %lx, (ppn+plb_entry_size)*2 is %lx\n", ppn*2, (ppn+plb_entry_size)*2); + + auto table_first = permission_bitmap.begin(); + auto table_last = permission_bitmap.begin(); + std::advance(table_first, ppn*2); + std::advance(table_last, (ppn+plb_entry_size)*2); + + ret.second.resize(plb_entry_size*2, 0); + + //do the copy + std::copy(table_first, table_last, ret.second.begin()); + assert(ret.second.size() == plb_entry_size*2); +#endif + + for (int i = 0; i < plb_entry_size*2; i++){ + ret.second.push_back(permission_bitmap[ppn*2+i]); + } + assert(ret.second.size() == plb_entry_size*2); + return ret; +} + +// STATS LIVE HERE +void +PermissionTable::regStats() +{ + + m_plb_read_hit + .name(name() + ".plb_read_hit") + .desc("Number of PLB read hits") + ; + + m_plb_write_hit + .name(name() + ".plb_write_hit") + .desc("Number of PLB write hits") + ; + + m_plb_read_miss + .name(name() + ".plb_read_miss") + .desc("Number of PLB read misses") + ; + + m_plb_write_miss + .name(name() + ".plb_write_miss") + .desc("Number of PLB write misses") + ; + + m_plb_read_insert_hit + .name(name() + ".plb_read_insert_hit") + .desc("Number of PLB insert with read hits") + ; + + m_plb_write_insert_hit + .name(name() + ".plb_write_insert_hit") + .desc("Number of PLB insert with write hits") + ; + + m_plb_read_insert_miss + .name(name() + ".plb_read_insert_miss") + .desc("Number of PLB insert with read misses") + ; + + m_plb_write_insert_miss + .name(name() + ".plb_write_insert_miss") + .desc("Number of PLB insert with write misses") + ; + + m_plb_read_partial_hit + .name(name() + ".plb_read_partial_hit") + .desc("Read insert with block hit, page miss") + ; + + m_plb_write_partial_hit + .name(name() + ".plb_write_partial_hit") + .desc("Write insert with block hit, page miss") + ; + + + + m_plb_write_mismatch + .name(name() + ".plb_write_mismatch") + .desc("Number of times we missed plb_write checks") + ; + + m_plb_evict + .name(name() + ".plb_evict") + .desc("Number of PLB evictions") + ; + + m_table_read + .name(name() + ".table_read") + .desc("Number of permission table reads") + ; + + m_table_write + .name(name() + ".table_write") + .desc("Number of permission table writes") + ; +} + +PermissionTable * +PermissionTableParams::create() +{ + return new PermissionTable(this); +} + diff -r 7b001aa001f0 -r 1b6fae7cb423 
src/mem/ruby/PermissionTable.hh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/mem/ruby/PermissionTable.hh Wed Dec 02 17:08:48 2015 -0600 @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2012-2013 Mark D. Hill and David A. Wood + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Authors: Lena Olson, Jason Power + * + */ + +#ifndef __PERMISSION_TABLE_HH__ +#define __PERMISSION_TABLE_HH__ + +#include "base/statistics.hh" +#include "config/the_isa.hh" +#include "mem/ruby/common/Address.hh" +#include "mem/ruby/system/System.hh" +#include "params/PermissionTable.hh" +#include "sim/clocked_object.hh" + +class PermissionTable : public ClockedObject +{ + protected: + typedef PermissionTableParams Params; + + public: + PermissionTable(const Params *p); + + void logAddress(Address addr, bool isWrite); + + bool insert(Addr page_addr, bool hasWrite); + + bool checkPLB(Address addr, bool isWriteback); + + bool checkTable(Address addr, bool isWriteback); + + // What 64-byte block is this address found in? + Address getTableAddress(Address addr); + + + private: + + uint64_t mem_size; + uint32_t plb_size; + uint64_t plb_alignment; //if x, 2^x addresses per entry + uint64_t plb_entry_size; + + // Why is this hardcoded? Easier to manage storage when we only know size + typedef std::pair<Address, std::vector<bool> > plb_entry; + + // cache / buffer (s) (address, write-permission) + // at runtime. + std::list<plb_entry> plb; + + + // bitmap: pairs of read-write bits. Read is first.
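+ //
+ // Layout: permission_bitmap[2*ppn] is the read bit and
+ // permission_bitmap[2*ppn+1] the write bit for physical page ppn; a 1GB
+ // memory at 4kB pages therefore needs 2 * 2^18 = 512Ki bits (64kB).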
+ std::vector<bool> permission_bitmap; + + + // PLB helper functions + plb_entry popPLBEntry(Address addr); + Address getPLBTag(Address addr); + + // table (bitmap) helper functions + void writeTable(Address addr, bool hasWrite); + bool checkTableRead(Address addr); + bool checkTableWrite(Address addr); + plb_entry getTableBlock(Address addr); + + uint64_t page_shift; //can't get ISA to work + + + + + // STATS LIVE HERE + public: + void regStats(); + + Stats::Scalar m_plb_read_hit; + Stats::Scalar m_plb_write_hit; + Stats::Scalar m_plb_read_miss; + Stats::Scalar m_plb_write_miss; + + Stats::Scalar m_plb_read_insert_hit; + Stats::Scalar m_plb_write_insert_hit; + Stats::Scalar m_plb_read_insert_miss; + Stats::Scalar m_plb_write_insert_miss; + Stats::Scalar m_plb_read_partial_hit; + Stats::Scalar m_plb_write_partial_hit; + + Stats::Scalar m_plb_evict; + Stats::Scalar m_plb_write_mismatch; + + Stats::Scalar m_table_read; + Stats::Scalar m_table_write; + +}; + +#endif // __PERMISSION_TABLE_HH__
diff -r 7b001aa001f0 -r 1b6fae7cb423 src/mem/ruby/PermissionTable.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/mem/ruby/PermissionTable.py Wed Dec 02 17:08:48 2015 -0600 @@ -0,0 +1,40 @@ +# Copyright (c) 2014 Mark D. Hill and David A. Wood +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# Authors: Jason Power +# Lena Olson +# + +from ClockedObject import ClockedObject +from m5.params import * + +class PermissionTable(ClockedObject): + type = 'PermissionTable' + cxx_class = 'PermissionTable' + cxx_header = "src/mem/ruby/PermissionTable.hh" + mem_size = Param.MemorySize("1GB", "physical memory capacity in bytes") + plb_size = Param.Int(64, "PLB size in entries") + plb_alignment = Param.Int(0, "Log2 of # addresses per PLB entry")
diff -r 7b001aa001f0 -r 1b6fae7cb423 src/mem/ruby/SConscript --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/mem/ruby/SConscript Wed Dec 02 17:08:48 2015 -0600 @@ -0,0 +1,57 @@ +# -*- mode:python -*- + +# Copyright (c) 2011 Mark D. Hill and David A. Wood +# All rights reserved.
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +Import('*') + +SimObject('PermissionTable.py') + +Source('PermissionTable.cc') + +DebugFlag('PermissionTable') + +from os.path import basename, isdir, join as joinpath + +# +# Link includes for SLICC ('cause it's dumb!!) +# +generated_dir = Dir('../../mem/protocol') + +def MakeIncludeAction(target, source, env): + f = file(str(target[0]), 'w') + for s in source: + print >>f, '#include "%s"' % str(s.abspath) + f.close() + +def MakeInclude(source): + target = generated_dir.File(basename(source)) + include_action1 = MakeAction(MakeIncludeAction, Transform("MAKE INC", 1)) + env.Command(target, source, include_action1) + +# Since this is a SLICC external object +MakeInclude('PermissionTable.hh') \ No newline at end of file # HG changeset patch # User Lena Olson # Date 1449097729 21600 # Node ID 4c279b99f8f872610c293d4581e2849ccbae9827 # Parent 1b6fae7cb423ecdab1ce5ec545f97637e438f884 imported patch extend-perm-table-VI diff -r 1b6fae7cb423 -r 4c279b99f8f8 configs/gpu_protocol/VI_hammer.py --- a/configs/gpu_protocol/VI_hammer.py Wed Dec 02 17:08:48 2015 -0600 +++ b/configs/gpu_protocol/VI_hammer.py Wed Dec 02 17:08:49 2015 -0600 @@ -224,6 +224,6 @@ # Connect the dma controller to the network dma_cntrl.responseFromDir = ruby_system.network.master - dma_cntrl.requestToDir = ruby_system.network.slave + dma_cntrl.reqToDirectory = ruby_system.network.slave return (cpu_sequencers, dir_cntrl_nodes, dma_cntrl_nodes, topology) diff -r 1b6fae7cb423 -r 4c279b99f8f8 configs/gpu_protocol/VI_hammer_bcu.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/configs/gpu_protocol/VI_hammer_bcu.py Wed Dec 02 17:08:49 2015 -0600 @@ -0,0 +1,8 @@ + +def define_options(parser): + parser.add_option("--allow-atomic-migration", action="store_true", + help="allow migratory sharing for atomic only accessed blocks") + parser.add_option("--pf-on", action="store_true", + help="Hammer: enable Probe Filter") + parser.add_option("--dir-on", action="store_true", + help="Hammer: enable Full-bit Directory") \ No newline at end of file diff -r 1b6fae7cb423 -r 4c279b99f8f8 
configs/gpu_protocol/VI_hammer_bcu_fusion.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/configs/gpu_protocol/VI_hammer_bcu_fusion.py Wed Dec 02 17:08:49 2015 -0600 @@ -0,0 +1,297 @@ +# Copyright (c) 2006-2007 The Regents of The University of Michigan +# Copyright (c) 2009 Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# Authors: Brad Beckmann + +import math +import m5 +import VI_hammer +from m5.objects import * +from m5.defines import buildEnv +from Cluster import Cluster + +# +# Note: the L1 Cache latency is only used by the sequencer on fast path hits +# +class L1Cache(RubyCache): + latency = 1 + +# +# Note: the L2 Cache latency is not currently used +# +class L2Cache(RubyCache): + latency = 15 + +def create_system(options, system, dma_devices, ruby_system): + + if not buildEnv['GPGPU_SIM']: + m5.util.panic("This script requires GPGPU-Sim integration to be built.") + + # Run the protocol script to setup CPU cluster, directory and DMA + (all_sequencers, dir_cntrls, dma_cntrls, cpu_cluster) = \ + VI_hammer.create_system(options, + system, + dma_devices, + ruby_system) + + cpu_cntrl_count = len(cpu_cluster) + len(dir_cntrls) + + # + # Build GPU cluster + # + gpu_cluster = Cluster(intBW = 32, extBW = 32) + gpu_cluster.disableConnectToParent() + + l2_bits = int(math.log(options.num_l2caches, 2)) + block_size_bits = int(math.log(options.cacheline_size, 2)) + # This represents the L1 to L2 interconnect latency + # NOTE! 
This latency is in Ruby (cache) cycles, not SM cycles + per_hop_interconnect_latency = 45 # ~15 GPU cycles + num_dance_hall_hops = int(math.log(options.num_sc, 2)) + if num_dance_hall_hops == 0: + num_dance_hall_hops = 1 + l1_to_l2_noc_latency = per_hop_interconnect_latency * num_dance_hall_hops + + # + # Caches for GPU cores + # + for i in xrange(options.num_sc): + # + # First create the Ruby objects associated with the GPU cores + # + cache = L1Cache(size = options.sc_l1_size, + assoc = options.sc_l1_assoc, + replacement_policy = "LRU", + start_index_bit = block_size_bits, + dataArrayBanks = 4, + tagArrayBanks = 4, + dataAccessLatency = 4, + tagAccessLatency = 4, + resourceStalls = False) + + l1_cntrl = GPUL1Cache_Controller(version = i, + cache = cache, + l2_select_num_bits = l2_bits, + num_l2 = options.num_l2caches, + issue_latency = l1_to_l2_noc_latency, + number_of_TBEs = options.gpu_l1_buf_depth, + ruby_system = ruby_system) + + gpu_seq = RubySequencer(version = options.num_cpus + i, + icache = cache, + dcache = cache, + access_phys_mem = True, + max_outstanding_requests = options.gpu_l1_buf_depth, + ruby_system = ruby_system, + deadlock_threshold = 2000000, + connect_to_io = False) + + l1_cntrl.sequencer = gpu_seq + + exec("ruby_system.l1_cntrl_sp%02d = l1_cntrl" % i) + + # + # Add controllers and sequencers to the appropriate lists + # + all_sequencers.append(gpu_seq) + gpu_cluster.add(l1_cntrl) + + # Connect the controller to the network + l1_cntrl.requestFromL1Cache = ruby_system.network.slave + l1_cntrl.atomicRequestFromL1Cache = ruby_system.network.slave + l1_cntrl.responseToL1Cache = ruby_system.network.master + + l2_index_start = block_size_bits + l2_bits + # Use L2 cache and interconnect latencies to calculate protocol latencies + # NOTE! These latencies are in Ruby (cache) cycles, not SM cycles + l2_cache_access_latency = 30 # ~10 GPU cycles + l2_to_l1_noc_latency = per_hop_interconnect_latency * num_dance_hall_hops + l2_to_mem_noc_latency = 125 # ~40 GPU cycles + + l2_clusters = [] + for i in xrange(options.num_l2caches): + # + # First create the Ruby objects associated with this cpu + # + l2_cache = L2Cache(size = options.sc_l2_size, + assoc = options.sc_l2_assoc, + start_index_bit = l2_index_start, + replacement_policy = "LRU", + dataArrayBanks = 4, + tagArrayBanks = 4, + dataAccessLatency = 4, + tagAccessLatency = 4, + resourceStalls = options.gpu_l2_resource_stalls) + + l2_cntrl = GPUL2Cache_Controller(version = i, + L2cache = l2_cache, + l2_response_latency = l2_cache_access_latency + + l2_to_l1_noc_latency, + l2_request_latency = l2_to_mem_noc_latency, + ruby_system = ruby_system) + + exec("ruby_system.l2_cntrl%d = l2_cntrl" % i) + l2_cluster = Cluster(intBW = 32, extBW = 32) + l2_cluster.add(l2_cntrl) + gpu_cluster.add(l2_cluster) + l2_clusters.append(l2_cluster) + + # Connect the controller to the network + l2_cntrl.responseToL1Cache = ruby_system.network.slave + l2_cntrl.requestFromCache = ruby_system.network.slave + l2_cntrl.responseFromCache = ruby_system.network.slave + l2_cntrl.unblockFromCache = ruby_system.network.slave + + l2_cntrl.requestFromL1Cache = ruby_system.network.master + l2_cntrl.atomicRequestFromL1Cache = ruby_system.network.master + l2_cntrl.forwardToCache = ruby_system.network.master + l2_cntrl.responseToCache = ruby_system.network.master + + ############################################################################ + # Pagewalk cache + # NOTE: We use a CPU L1 cache controller here. 
This is to facilitate MMU + # cache coherence (as the GPU L1 caches are incoherent without flushes). + # The L2 cache is small, and should have minimal effect on the + # performance (see Section 6.2 of Power et al. HPCA 2014). + pwd_cache = L1Cache(size = options.pwc_size, + assoc = 16, # 64 is fully associative @ 8kB + replacement_policy = "LRU", + start_index_bit = block_size_bits, + latency = 8, + resourceStalls = False) + # Small cache since CPU L1 requires I and D + pwi_cache = L1Cache(size = "512B", + assoc = 2, + replacement_policy = "LRU", + start_index_bit = block_size_bits, + latency = 8, + resourceStalls = False) + + # Small cache since CPU L1 controller requires L2 + l2_cache = L2Cache(size = "512B", + assoc = 2, + start_index_bit = block_size_bits, + latency = 1, + resourceStalls = False) + + l1_cntrl = L1Cache_Controller(version = options.num_cpus, + L1Icache = pwi_cache, + L1Dcache = pwd_cache, + L2cache = l2_cache, + send_evictions = False, + issue_latency = l1_to_l2_noc_latency, + cache_response_latency = 1, + l2_cache_hit_latency = 1, + number_of_TBEs = options.gpu_l1_buf_depth, + ruby_system = ruby_system) + + cpu_seq = RubySequencer(version = options.num_cpus + options.num_sc, + icache = pwd_cache, # Never get data from pwi_cache + dcache = pwd_cache, + access_phys_mem = True, + max_outstanding_requests = options.gpu_l1_buf_depth, + ruby_system = ruby_system, + deadlock_threshold = 2000000, + connect_to_io = False) + + l1_cntrl.sequencer = cpu_seq + + + ruby_system.l1_pw_cntrl = l1_cntrl + all_sequencers.append(cpu_seq) + + gpu_cluster.add(l1_cntrl) + + # Connect the L1 controller and the network + # Connect the buffers from the controller to network + l1_cntrl.requestFromCache = ruby_system.network.slave + l1_cntrl.responseFromCache = ruby_system.network.slave + l1_cntrl.unblockFromCache = ruby_system.network.slave + + # Connect the buffers from the network to the controller + l1_cntrl.forwardToCache = ruby_system.network.master + l1_cntrl.responseToCache = ruby_system.network.master + + + # + # Create controller for the copy engine to connect to in GPU cluster + # Cache is unused by controller + # + cache = L1Cache(size = "4096B", assoc = 2) + + gpu_ce_seq = RubySequencer(version = options.num_cpus + options.num_sc+1, + icache = cache, + dcache = cache, + access_phys_mem = True, + max_outstanding_requests = 64, + support_inst_reqs = False, + ruby_system = ruby_system, + connect_to_io = False) + + gpu_ce_cntrl = GPUCopyDMA_Controller(version = 0, + sequencer = gpu_ce_seq, + number_of_TBEs = 256, + ruby_system = ruby_system) + + ruby_system.l1_cntrl_ce = gpu_ce_cntrl + + all_sequencers.append(gpu_ce_seq) + + gpu_ce_cntrl.responseFromDir = ruby_system.network.master + gpu_ce_cntrl.reqToDirectory = ruby_system.network.slave + + # BCU + bcu_cntrl = BorderControlUnit_Controller(version = 0, + perm_table = PermissionTable(), + ruby_system = ruby_system) + ruby_system.bcu_cntrl = bcu_cntrl + + bcu_cntrl.unblockToDirFromCache = ruby_system.network.master + bcu_cntrl.responseToDirFromCache = ruby_system.network.master + bcu_cntrl.requestToDirFromCache = ruby_system.network.master + + # Connect the buffers from the network to the controller + bcu_cntrl.requestFromCacheToDir = ruby_system.network.slave + bcu_cntrl.responseFromCacheToDir = ruby_system.network.slave + bcu_cntrl.unblockFromCacheToDir = ruby_system.network.slave + + complete_cluster = Cluster(intBW = 32, extBW = 32) + complete_cluster.add(bcu_cntrl) + complete_cluster.add(gpu_ce_cntrl) +
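# NOTE: the paired buffer names above (*ToDirFromCache vs. + # *FromCacheToDir) suggest the BCU is interposed on each + # cache-to-directory message class (request, response, unblock) so it + # can run its PermissionTable checks in-line; the root cluster below + # then gathers every controller into a single hierarchy for the + # topology code. +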
complete_cluster.add(cpu_cluster) + complete_cluster.add(gpu_cluster) + + for cntrl in dir_cntrls: + complete_cluster.add(cntrl) + + for cntrl in dma_cntrls: + complete_cluster.add(cntrl) + + for cluster in l2_clusters: + complete_cluster.add(cluster) + + return (all_sequencers, dir_cntrls, complete_cluster) diff -r 1b6fae7cb423 -r 4c279b99f8f8 src/mem/protocol/MOESI_hammer_bcu-BCU.sm --- a/src/mem/protocol/MOESI_hammer_bcu-BCU.sm Wed Dec 02 17:08:48 2015 -0600 +++ b/src/mem/protocol/MOESI_hammer_bcu-BCU.sm Wed Dec 02 17:08:49 2015 -0600 @@ -139,10 +139,10 @@ } /*Do we need write permission*/ else if (in_msg.Type == CoherenceResponseType:DATA || - in_msg.Type == CoherenceResponseType:DATA_EXCLUSIVE || - in_msg.Type == CoherenceResponseType:DATA_SHARED){ - /*With this coherence protocol, hard to tell if this was an - innocent read that got O or what. Use dirty bit.*/ + in_msg.Type == CoherenceResponseType:DATA_EXCLUSIVE || + in_msg.Type == CoherenceResponseType:DATA_SHARED){ + /*With this coherence protocol, hard to tell if this was an + innocent read that got O or what. Use dirty bit.*/ perm_table.logAddress(in_msg.Addr, true); bool ret := perm_table.checkPLB(in_msg.Addr, in_msg.Dirty); if (ret != true) { @@ -175,9 +175,9 @@ /*Do we need read permission*/ if (in_msg.Type == CoherenceResponseType:UNBLOCK || in_msg.Type == CoherenceResponseType:UNBLOCKS || - in_msg.Type == CoherenceResponseType:WB_CLEAN || - in_msg.Type == CoherenceResponseType:UNBLOCKM || - in_msg.Type == CoherenceResponseType:WB_EXCLUSIVE_CLEAN){ + in_msg.Type == CoherenceResponseType:WB_CLEAN || + in_msg.Type == CoherenceResponseType:UNBLOCKM || + in_msg.Type == CoherenceResponseType:WB_EXCLUSIVE_CLEAN){ perm_table.logAddress(in_msg.Addr, false); bool ret := perm_table.checkPLB(in_msg.Addr, false); if (ret != true) { @@ -187,7 +187,7 @@ } /*Do we need write permission*/ else if (in_msg.Type == CoherenceResponseType:WB_DIRTY || - in_msg.Type == CoherenceResponseType:WB_EXCLUSIVE_DIRTY){ + in_msg.Type == CoherenceResponseType:WB_EXCLUSIVE_DIRTY){ perm_table.logAddress(in_msg.Addr, true); bool ret := perm_table.checkPLB(in_msg.Addr, true); if (ret != true) { @@ -195,7 +195,7 @@ latency := miss_latency; } } - enqueue(unblockNetwork_out, ResponseMsg, latency) { + enqueue(unblockNetwork_out, ResponseMsg, latency) { out_msg.Addr := in_msg.Addr; out_msg.Type := in_msg.Type; out_msg.Sender := in_msg.Sender; @@ -215,29 +215,29 @@ action(rqc_reqtodir, "rqc", desc="RequestFromCache") { peek(requestFromCache_in, RequestMsg) { DPRINTF(RubySlicc, "Got req to addr %s\n", in_msg.Addr); - Cycles latency := hit_latency; - /*Do we need read permission*/ - if (in_msg.Type == CoherenceRequestType:GETX || - in_msg.Type == CoherenceRequestType:GETS || - in_msg.Type == CoherenceRequestType:MERGED_GETS || - in_msg.Type == CoherenceRequestType:GETF){ - perm_table.logAddress(in_msg.Addr, false); - bool ret := perm_table.checkPLB(in_msg.Addr, false); - if (ret != true) { - perm_table.checkTable(in_msg.Addr, false); - latency := miss_latency; - } - } - /*Do we need write permission*/ - else if (in_msg.Type == CoherenceRequestType:PUT || - in_msg.Type == CoherenceRequestType:PUTF){ - perm_table.logAddress(in_msg.Addr, true); - bool ret := perm_table.checkPLB(in_msg.Addr, true); - if (ret != true) { - perm_table.checkTable(in_msg.Addr, true); - latency := miss_latency; - } - } + Cycles latency := hit_latency; + /*Do we need read permission*/ + if (in_msg.Type == CoherenceRequestType:GETX || + in_msg.Type == CoherenceRequestType:GETS || + in_msg.Type == 
CoherenceRequestType:MERGED_GETS || + in_msg.Type == CoherenceRequestType:GETF){ + perm_table.logAddress(in_msg.Addr, false); + bool ret := perm_table.checkPLB(in_msg.Addr, false); + if (ret != true) { + perm_table.checkTable(in_msg.Addr, false); + latency := miss_latency; + } + } + /*Do we need write permission*/ + else if (in_msg.Type == CoherenceRequestType:PUT || + in_msg.Type == CoherenceRequestType:PUTF){ + perm_table.logAddress(in_msg.Addr, true); + bool ret := perm_table.checkPLB(in_msg.Addr, true); + if (ret != true) { + perm_table.checkTable(in_msg.Addr, true); + latency := miss_latency; + } + } enqueue(requestNetwork_out, RequestMsg, latency) { out_msg.Addr := in_msg.Addr; out_msg.Type := in_msg.Type; diff -r 1b6fae7cb423 -r 4c279b99f8f8 src/mem/protocol/SConsopts --- a/src/mem/protocol/SConsopts Wed Dec 02 17:08:48 2015 -0600 +++ b/src/mem/protocol/SConsopts Wed Dec 02 17:08:49 2015 -0600 @@ -34,6 +34,7 @@ all_protocols.extend([ 'VI_hammer', + 'VI_hammer_bcu', 'MOESI_hammer_bcu', ]) diff -r 1b6fae7cb423 -r 4c279b99f8f8 src/mem/protocol/VI_hammer-msg.sm --- a/src/mem/protocol/VI_hammer-msg.sm Wed Dec 02 17:08:48 2015 -0600 +++ b/src/mem/protocol/VI_hammer-msg.sm Wed Dec 02 17:08:49 2015 -0600 @@ -100,6 +100,7 @@ MachineID Requestor, desc="Node who initiated the request"; NetDest MergedRequestors, desc="Merge set of read requestors"; NetDest Destination, desc="Multicast destination mask"; + NetDest OriginalDestination, desc="Multicast destination mask"; MessageSizeType MessageSize, desc="size category of the message"; bool DirectedProbe, default="false", desc="probe filter directed probe"; Cycles InitialRequestTime, default="Cycles(0)", desc="time the initial requests was sent from the L1Cache"; @@ -140,6 +141,7 @@ MachineID Sender, desc="Node who sent the data"; MachineID CurOwner, desc="current owner of the block, used for UnblockS responses"; NetDest Destination, desc="Node to whom the data is sent"; + NetDest OriginalDestination, desc="Multicast destination mask"; DataBlock DataBlk, desc="data for the cache line"; bool Dirty, desc="Is the data dirty (different than memory)?"; int Acks, default="0", desc="How many messages this counts as"; diff -r 1b6fae7cb423 -r 4c279b99f8f8 src/mem/protocol/VI_hammer_bcu-GPUL2cache.sm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/mem/protocol/VI_hammer_bcu-GPUL2cache.sm Wed Dec 02 17:08:49 2015 -0600 @@ -0,0 +1,1529 @@ + +machine(GPUL2Cache, "Simple write back L2 cache") + : CacheMemory * L2cache; + Cycles l2_request_latency := 260; + Cycles l2_response_latency := 2; + Cycles cache_response_latency := 260; + +//Note: we might have a problem if two Get atomics arrive from different L1's at the same time + + + // NETWORK BUFFERS + // Buffers to and from L1 caches + MessageBuffer * requestFromL1Cache, network="From", virtual_network="7", + ordered="true", vnet_type="request"; + MessageBuffer * responseToL1Cache, network="To", virtual_network="6", + ordered="true", vnet_type="response"; + MessageBuffer * atomicRequestFromL1Cache, network="From", virtual_network="8", + ordered="true", vnet_type="request"; + + // Buffers to / from the dir and other caches + MessageBuffer * requestFromCache, network="To", virtual_network="2", + ordered="false", vnet_type="request"; + MessageBuffer * responseFromCache, network="To", virtual_network="4", + ordered="false", vnet_type="response"; + MessageBuffer * unblockFromCache, network="To", virtual_network="5", + ordered="false", vnet_type="unblock"; + + MessageBuffer * forwardToCache, network="From", 
virtual_network="3", + ordered="false", vnet_type="forward"; + MessageBuffer * responseToCache, network="From", virtual_network="4", + ordered="false", vnet_type="response"; + +{ + // STATES + state_declaration(State, desc="Cache states") { + I, AccessPermission:Invalid, desc="Idle"; + S, AccessPermission:Read_Only, desc="Shared"; + O, AccessPermission:Read_Only, desc="Owned"; + M, AccessPermission:Read_Only, desc="Modified (dirty)"; + MM, AccessPermission:Read_Write, desc="Modified (dirty and locally modified)"; + + // States for atomics + MM_A, AccessPermission:Busy, "MM^A", desc="Done an atomic get, waiting for the atomic put"; + IM_A, AccessPermission:Busy, "IM^A", desc="Done an atomic get, like IM"; + SM_A, AccessPermission:Busy, "SM^A", desc="Done an atomic get, like SM"; + OM_A, AccessPermission:Busy, "OM^A", desc="Done an atomic get, like OM"; + SM_AA, AccessPermission:Busy, "SM^AA", desc="Waiting for final acks"; + IM_AA, AccessPermission:Busy, "IM^AA", desc="Waiting for final acks"; + + // Transient states (from hammer) + IM, AccessPermission:Busy, "IM", desc="Issued GetX"; + ISM, AccessPermission:Read_Only, "ISM", desc="Issued GetX, received valid data, waiting for all acks"; + SM, AccessPermission:Read_Only, "SM", desc="Issued GetX, we still have a valid copy of the line"; + OM, AccessPermission:Read_Only, "OM", desc="Issued GetX, received data"; + IS, AccessPermission:Busy, "IS", desc="Issued GetS"; + SS, AccessPermission:Read_Only, "SS", desc="Issued GetS, received data, waiting for all acks"; + OI, AccessPermission:Busy, "OI", desc="Issued PutO, waiting for ack"; + MI, AccessPermission:Busy, "MI", desc="Issued PutX, waiting for ack"; + II, AccessPermission:Busy, "II", desc="Issued PutX/O, saw Other_GETS or Other_GETX, waiting for ack"; + + M_W, AccessPermission:Read_Only, "M^W", desc="Issued GetS, received exclusive data, waiting for acks"; + MM_W, AccessPermission:Read_Write, "MM^W", desc="Issued GetX, received exclusive data"; + } + + // EVENTS + enumeration(Event, desc="Cache events") { + // From L1 + Get, desc="Get request from L1"; + Store, desc="Put request from L1"; + Replacement, desc="Replace a block"; + Get_Atom, desc="Atomic get request from L1"; + Put_Atom, desc="Atomic put request from L1"; + + // From CPU caches + Other_GETX, desc="A GetX from another processor"; + Other_GETS, desc="A GetS from another processor"; + Merged_GETS, desc="A Merged GetS from another processor"; + NC_DMA_GETS, desc="special GetS when only DMA exists"; + Invalidate, desc="Invalidate block"; + + // ??? + Block_Ack, desc="the directory is blocked and ready for the flush"; + + // From dir + Ack, desc="Received an ack message"; + Shared_Ack, desc="Received an ack message, responder has a shared copy"; + Data, desc="Received a data message"; + Shared_Data, desc="Received a data message, responder has a shared copy"; + Exclusive_Data, desc="Received a data message, responder had an exclusive copy, they gave it to us"; + + Writeback_Ack, desc="Writeback O.K. from directory"; + Writeback_Nack, desc="Writeback not O.K. 
from directory"; + + // triggers + All_acks, desc="Received all required data and message acks"; + All_acks_no_sharers, desc="Received all acks and no other processor has a shared copy"; + } + + enumeration(RequestType, desc="Type of request for each transition") { + DataArrayRead, desc="L2 Data array read"; + DataArrayWrite, desc="L2 Data array write"; + TagArrayRead, desc="L2 Tag array read"; + TagArrayWrite, desc="L2 Tag array write"; + } + + // STRUCTURE DEFINITIONS + + // CacheEntry + structure(Entry, desc="...", interface="AbstractCacheEntry") { + State CacheState, desc="cache state"; + bool Dirty, desc="Is the data dirty (different than memory)?"; + DataBlock DataBlk, desc="Data in the block"; + } + + + // TBE fields + structure(TBE, desc="...") { + State TBEState, desc="Transient state"; + DataBlock DataBlk, desc="data for the block, required for concurrent writebacks"; + bool Dirty, desc="Is the data dirty (different than memory)?"; + int NumPendingMsgs, desc="Number of acks/data messages that this processor is waiting for"; + bool Sharers, desc="On a GetS, did we find any other sharers in the system"; + bool AppliedSilentAcks, default="false", desc="for full-bit dir, does the pending msg count reflect the silent acks"; + MachineID LastResponder, desc="last machine to send a response for this request"; + MachineID CurOwner, desc="current owner of the block, used for UnblockS responses"; + Cycles InitialRequestTime, default="Cycles(0)", desc="time the initial requests was sent from the L1Cache"; + Cycles ForwardRequestTime, default="Cycles(0)", desc="time the dir forwarded the request"; + Cycles FirstResponseTime, default="Cycles(0)", desc="the Cycles the first response was received"; + + DataBlock DirtyDataBlk, desc="Dirty data for a write. 
Separate from DataBlk since that's 'clean' data from other caches"; + int Offset, desc="Offset of write into line"; + int Size, desc="Size of the write"; + + MachineID Requestor, desc="The requestor for this block"; + } + + structure(TBETable, external="yes") { + TBE lookup(Address); + void allocate(Address); + void deallocate(Address); + bool isPresent(Address); + } + + + // STRUCTURES + + TBETable TBEs, template="", constructor="m_number_of_TBEs"; + + // PROTOTYPES + void set_cache_entry(AbstractCacheEntry a); + void unset_cache_entry(); + void set_tbe(TBE b); + void unset_tbe(); + + // For hammer + void wakeUpBuffers(Address a); + void wakeUpAllBuffers(); + Cycles curCycle(); + + Entry getCacheEntry(Address address), return_by_pointer="yes" { + return static_cast(Entry, "pointer", L2cache.lookup(address)); + } + + State getState(TBE tbe, Entry cache_entry, Address addr) { + if (is_valid(tbe)) { + return tbe.TBEState; + } + else if (is_valid(cache_entry)) { + return cache_entry.CacheState; + } + else { + return State:I; + } + } + + void setState(TBE tbe, Entry cache_entry, Address addr, State state) { + if (is_valid(tbe)) { + tbe.TBEState := state; + } + + if (is_valid(cache_entry)) { + cache_entry.CacheState := state; + } + } + + AccessPermission getAccessPermission(Address addr) { + TBE tbe := TBEs[addr]; + if(is_valid(tbe)) { + return GPUL2Cache_State_to_permission(tbe.TBEState); + } + + Entry cache_entry := getCacheEntry(addr); + if(is_valid(cache_entry)) { + return GPUL2Cache_State_to_permission(cache_entry.CacheState); + } + + return AccessPermission:NotPresent; + } + + void setAccessPermission(Entry cache_entry, Address addr, State state) { + if (is_valid(cache_entry)) { + cache_entry.changePermission(GPUL2Cache_State_to_permission(state)); + } + } + + DataBlock getDataBlock(Address addr), return_by_ref="yes" { + Entry cache_entry := getCacheEntry(addr); + if(is_valid(cache_entry)) { + return cache_entry.DataBlk; + } + + TBE tbe := TBEs[addr]; + if(is_valid(tbe)) { + return tbe.DataBlk; + } + + error("Missing data block"); + } + + Event L1Cache_request_type_to_event(CoherenceRequestTypeVI type, Address addr, + MachineID requestor, Entry cache_entry) { + if(type == CoherenceRequestTypeVI:GET) { + return Event:Get; + } else if (type == CoherenceRequestTypeVI:PUT) { + return Event:Store; + } else if (type == CoherenceRequestTypeVI:GET_Atom) { + return Event:Get_Atom; + } else if (type == CoherenceRequestTypeVI:PUT_Atom) { + return Event:Put_Atom; + }else { + error("Invalid L1 request type"); + } + } + + void recordRequestType(RequestType type, Address addr) { + if (type == RequestType:DataArrayRead) { + L2cache.recordRequestType(CacheRequestType:DataArrayRead); + } else if (type == RequestType:DataArrayWrite) { + L2cache.recordRequestType(CacheRequestType:DataArrayWrite); + } else if (type == RequestType:TagArrayRead) { + L2cache.recordRequestType(CacheRequestType:TagArrayRead); + } else if (type == RequestType:TagArrayWrite) { + L2cache.recordRequestType(CacheRequestType:TagArrayWrite); + } else { + error("Bad request type passed to recordRequestType"); + } + } + + bool checkResourceAvailable(RequestType type, Address addr) { + if (type == RequestType:DataArrayRead) { + return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (type == RequestType:DataArrayWrite) { + return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (type == RequestType:TagArrayRead) { + return L2cache.checkResourceAvailable(CacheResourceType:TagArray, 
addr); + } else if (type == RequestType:TagArrayWrite) { + return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else { + error("Bad request type passed to checkResourceAvailable"); + } + } + + MessageBuffer triggerQueue, ordered="false"; + + // NETWORK PORTS + + out_port(responseNetworkL1_out, ResponseMsgVI, responseToL1Cache); + + out_port(requestNetwork_out, RequestMsg, requestFromCache); + out_port(unblockNetwork_out, ResponseMsg, unblockFromCache); + out_port(responseNetwork_out, ResponseMsg, responseFromCache); + out_port(triggerQueue_out, TriggerMsg, triggerQueue); + + // Trigger Queue + in_port(triggerQueue_in, TriggerMsg, triggerQueue, rank=3) { + if (triggerQueue_in.isReady()) { + peek(triggerQueue_in, TriggerMsg) { + + Entry cache_entry := getCacheEntry(in_msg.Addr); + TBE tbe := TBEs[in_msg.Addr]; + + if (in_msg.Type == TriggerType:ALL_ACKS) { + trigger(Event:All_acks, in_msg.Addr, cache_entry, tbe); + } else if (in_msg.Type == TriggerType:ALL_ACKS_NO_SHARERS) { + trigger(Event:All_acks_no_sharers, in_msg.Addr, cache_entry, tbe); + } else { + error("Unexpected message"); + } + } + } + } + + in_port(responseToCache_in, ResponseMsg, responseToCache, rank=2) { + if (responseToCache_in.isReady()) { + peek(responseToCache_in, ResponseMsg, block_on="Addr") { + + Entry cache_entry := getCacheEntry(in_msg.Addr); + TBE tbe := TBEs[in_msg.Addr]; + + if (in_msg.Type == CoherenceResponseType:ACK) { + trigger(Event:Ack, in_msg.Addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceResponseType:ACK_SHARED) { + trigger(Event:Shared_Ack, in_msg.Addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceResponseType:DATA) { + trigger(Event:Data, in_msg.Addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceResponseType:DATA_SHARED) { + trigger(Event:Shared_Data, in_msg.Addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceResponseType:DATA_EXCLUSIVE) { + trigger(Event:Exclusive_Data, in_msg.Addr, cache_entry, tbe); + } else { + error("Unexpected message"); + } + } + } + } + // Forward Network + in_port(forwardToCache_in, RequestMsg, forwardToCache, rank=1) { + if (forwardToCache_in.isReady()) { + peek(forwardToCache_in, RequestMsg, block_on="Addr") { + + Entry cache_entry := getCacheEntry(in_msg.Addr); + TBE tbe := TBEs[in_msg.Addr]; + + if ((in_msg.Type == CoherenceRequestType:GETX) || (in_msg.Type == CoherenceRequestType:GETF)) { + trigger(Event:Other_GETX, in_msg.Addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:MERGED_GETS) { + trigger(Event:Merged_GETS, in_msg.Addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:GETS) { + if (machineCount(MachineType:L1Cache)+machineCount(MachineType:GPUL2Cache) > 1) { + trigger(Event:Other_GETS, in_msg.Addr, cache_entry, tbe); + } else { + trigger(Event:NC_DMA_GETS, in_msg.Addr, cache_entry, tbe); + } + } else if (in_msg.Type == CoherenceRequestType:INV) { + trigger(Event:Invalidate, in_msg.Addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:WB_ACK) { + trigger(Event:Writeback_Ack, in_msg.Addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:WB_NACK) { + trigger(Event:Writeback_Nack, in_msg.Addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:BLOCK_ACK) { + trigger(Event:Block_Ack, in_msg.Addr, cache_entry, tbe); + } else { + error("Unexpected message"); + } + } + } + } + + in_port(requestQueue_in, RequestMsgVI, requestFromL1Cache, desc="...") { + if (requestQueue_in.isReady()) { + 
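// Demand path from the GPU L1s: if the line is absent and its set is + // full, trigger a Replacement on the victim chosen by cacheProbe() + // before the original request itself is handled. +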
peek(requestQueue_in, RequestMsgVI, block_on="Addr") { + + Entry cache_entry := getCacheEntry(in_msg.Addr); + if (is_invalid(cache_entry) && + L2cache.cacheAvail(in_msg.Addr) == false ) { + // make room for the block + trigger(Event:Replacement, L2cache.cacheProbe(in_msg.Addr), + getCacheEntry(L2cache.cacheProbe(in_msg.Addr)), + TBEs[L2cache.cacheProbe(in_msg.Addr)]); + } + else { + trigger(L1Cache_request_type_to_event(in_msg.Type, in_msg.Addr, + in_msg.Requestor, cache_entry), + in_msg.Addr, cache_entry, TBEs[in_msg.Addr]); + } + } + } + } + + in_port(atomicRequestQueue_in, RequestMsgVI, atomicRequestFromL1Cache, desc="...") { + if (atomicRequestQueue_in.isReady()) { + peek(atomicRequestQueue_in, RequestMsgVI, block_on="Addr") { + Entry cache_entry := getCacheEntry(in_msg.Addr); + assert(is_valid(cache_entry)); + trigger(Event:Put_Atom, in_msg.Addr, cache_entry, TBEs[in_msg.Addr]); + } + } + } + + // ACTIONS + + action(a_issueGETS, "a", desc="Issue GETS") { + enqueue(requestNetwork_out, RequestMsg, l2_request_latency) { + assert(is_valid(tbe)); + out_msg.Addr := address; + out_msg.Type := CoherenceRequestType:GETS; + out_msg.Requestor := machineID; + out_msg.Destination.broadcast(MachineType:BorderControlUnit); + out_msg.OriginalDestination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.InitialRequestTime := curCycle(); + tbe.NumPendingMsgs := machineCount(MachineType:L1Cache)+machineCount(MachineType:GPUL2Cache); // One from each other cache (n-1) plus the memory (+1) + } + } + + action(b_issueGETX, "b", desc="Issue GETX") { + enqueue(requestNetwork_out, RequestMsg, l2_request_latency) { + assert(is_valid(tbe)); + out_msg.Addr := address; + out_msg.Type := CoherenceRequestType:GETX; + out_msg.Requestor := machineID; + out_msg.Destination.broadcast(MachineType:BorderControlUnit); + out_msg.OriginalDestination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.InitialRequestTime := curCycle(); + tbe.NumPendingMsgs := machineCount(MachineType:L1Cache)+machineCount(MachineType:GPUL2Cache); // One from each other cache (n-1) plus the memory (+1) + } + } + + action(d_issuePUT, "d", desc="Issue PUT") { + enqueue(requestNetwork_out, RequestMsg, l2_request_latency) { + out_msg.Addr := address; + out_msg.Type := CoherenceRequestType:PUT; + out_msg.Requestor := machineID; + out_msg.Destination.broadcast(MachineType:BorderControlUnit); + out_msg.OriginalDestination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Writeback_Control; + } + } + + action(ii_allocateL2CacheBlock, "\i", desc="Allocate a cache block") { + if (is_valid(cache_entry)) { + } else { + set_cache_entry(L2cache.allocate(address, new Entry)); + } + } + + action(rr_deallocateL2CacheBlock, "\r", desc="deallocate a cache block") { + if (is_valid(cache_entry)) { + L2cache.deallocate(address); + unset_cache_entry(); + } + } + + action(rq_popL1IncomingQueue, "rq", desc="Pop the L1 request queue") { + requestQueue_in.dequeue(); + } + + action(n_popResponseQueue, "n", desc="Pop the response queue") { + responseToCache_in.dequeue(); + } + + action(aq_popL1AtomicQueue, "aq", desc="Pop the atomic L1 request queue") { + atomicRequestQueue_in.dequeue(); + } + + action(j_popTriggerQueue, "j", desc="Pop trigger queue.") { + triggerQueue_in.dequeue(); + } + + action(l_popForwardQueue, "l", desc="Pop forwarded request queue.") { + forwardToCache_in.dequeue(); + } + + action(h_load_hit, "h",
desc="Send data to L1.") { + assert(is_valid(cache_entry)); + peek(requestQueue_in, RequestMsgVI) { + enqueue(responseNetworkL1_out, ResponseMsgVI, l2_response_latency) { + out_msg.Addr := address; + out_msg.Type := CoherenceResponseTypeVI:DATA; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.Requestor); + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + } + } + ++L2cache.demand_hits; + } + + action(ha_load_hit, "ha", desc="Send data to L1 for atomic") { + assert(is_valid(cache_entry)); + enqueue(responseNetworkL1_out, ResponseMsgVI, l2_response_latency) { + out_msg.Addr := address; + out_msg.Type := CoherenceResponseTypeVI:DATA; + out_msg.Sender := machineID; + out_msg.Destination.add(tbe.Requestor); + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + } + } + + action(hx_external_load_hit, "hx", desc="load required external msgs, send data to L1") { + assert(is_valid(cache_entry)); + assert(is_valid(tbe)); + peek(responseToCache_in, ResponseMsg) { + enqueue(responseNetworkL1_out, ResponseMsgVI, l2_response_latency) { + out_msg.Addr := address; + out_msg.Type := CoherenceResponseTypeVI:DATA; + out_msg.Sender := in_msg.Sender; + out_msg.Destination.add(tbe.Requestor); + out_msg.DataBlk := in_msg.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + } + } + } + + action(hh_store_hit, "\h", desc="Notify L1 that store completed.") { + assert(is_valid(cache_entry)); + peek(requestQueue_in, RequestMsgVI) { + cache_entry.DataBlk.copyPartial(in_msg.DataBlk, in_msg.Offset, in_msg.Size); + cache_entry.Dirty := true; + } + ++L2cache.demand_hits; + DPRINTF(RubySlicc, "%s %s\n", address, cache_entry.DataBlk); + } + + action(sx_external_store_hit, "sx", desc="store required external msgs, Notify L1 that store completed.") { + assert(is_valid(cache_entry)); + assert(is_valid(tbe)); + cache_entry.DataBlk.copyPartial(tbe.DirtyDataBlk, tbe.Offset, tbe.Size); + cache_entry.Dirty := true; + peek(responseToCache_in, ResponseMsg) { + if (machineIDToMachineType(in_msg.Sender) == MachineType:Directory) { + //profileGPUL2WriteMiss(GenericMachineType:Directory); + } + } + DPRINTF(RubySlicc, "From L1: %s %s\n", address, tbe.DirtyDataBlk); + DPRINTF(RubySlicc, "%s: offset: %d, size: %d\n", address, tbe.Offset, tbe.Size); + DPRINTF(RubySlicc, "%s %s\n", address, cache_entry.DataBlk); + } + + action(sxt_trig_ext_store_hit, "sxt", desc="store required external msgs, Notify L1 that store completed.") { + assert(is_valid(cache_entry)); + assert(is_valid(tbe)); + cache_entry.DataBlk.copyPartial(tbe.DirtyDataBlk, tbe.Offset, tbe.Size); + cache_entry.Dirty := true; + if (machineIDToMachineType(tbe.LastResponder) == MachineType:Directory) { + //profileGPUL2WriteMiss(GenericMachineType:Directory); + } else if (machineIDToMachineType(tbe.LastResponder) == MachineType:L1Cache) { + //profileGPUL2WriteMiss(GenericMachineType:L1Cache_wCC); + } else if (machineIDToMachineType(tbe.LastResponder) == MachineType:GPUL2Cache) { + //profileGPUL2WriteMiss(GenericMachineType:L1Cache_wCC); + } else { + error("Only expect responses from Directory, L1Cache or GPUL2Cache"); + } + DPRINTF(RubySlicc, "From L1: %s %s\n", address, tbe.DirtyDataBlk); + DPRINTF(RubySlicc, "%s: offset: %d, size: %d\n", address, tbe.Offset, tbe.Size); + DPRINTF(RubySlicc, "%s %s\n", address, cache_entry.DataBlk); + } + + action(sa_store_hit, "sa", desc="Notify L1 that an atomic store completed.") { + assert(is_valid(cache_entry)); + 
peek(atomicRequestQueue_in, RequestMsgVI) { + enqueue(responseNetworkL1_out, ResponseMsgVI, l2_response_latency) { + out_msg.Addr := address; + out_msg.Type := CoherenceResponseTypeVI:WB_ACK; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.Requestor); + out_msg.MessageSize := MessageSizeType:Writeback_Control; + } + } + ++L2cache.demand_hits; + } + + action(as_ackStore, "as", desc="Ack the requestor that the store is complete") { + peek(requestQueue_in, RequestMsgVI) { + enqueue(responseNetworkL1_out, ResponseMsgVI, l2_response_latency) { + out_msg.Addr := address; + out_msg.Type := CoherenceResponseTypeVI:WB_ACK; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.Requestor); + out_msg.MessageSize := MessageSizeType:Writeback_Control; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + action(aes_ackExternalStore, "aes", desc="Ack the requestor that the store is complete") { + assert(is_valid(tbe)); + enqueue(responseNetworkL1_out, ResponseMsgVI, l2_response_latency) { + out_msg.Addr := address; + out_msg.Type := CoherenceResponseTypeVI:WB_ACK; + out_msg.Sender := machineID; + out_msg.Destination.add(tbe.Requestor); + out_msg.MessageSize := MessageSizeType:Writeback_Control; + DPRINTF(RubySlicc, "%s\n", out_msg); + DPRINTF(RubySlicc, "%s %s\n", address, tbe.Requestor); + } + } + + action(es_recordRequestor, "es", desc="record the requestor ID in the TBE") { + assert(is_valid(tbe)); + peek(requestQueue_in, RequestMsgVI) { + tbe.Requestor := in_msg.Requestor; + tbe.DirtyDataBlk := in_msg.DataBlk; + tbe.Offset := in_msg.Offset; + tbe.Size := in_msg.Size; + DPRINTF(RubySlicc, "Recording requestor %s %s\n", address, in_msg.Requestor); + } + } + + action(u_writeDataToCache, "u", desc="Write data to cache") { + peek(responseToCache_in, ResponseMsg) { + assert(is_valid(cache_entry)); + cache_entry.DataBlk := in_msg.DataBlk; + cache_entry.Dirty := in_msg.Dirty; + } + } + + action(i_allocateTBE, "i", desc="Allocate TBE") { + check_allocate(TBEs); + assert(is_valid(cache_entry)); + TBEs.allocate(address); + set_tbe(TBEs[address]); + tbe.DataBlk := cache_entry.DataBlk; // Data only used for writebacks + tbe.Dirty := cache_entry.Dirty; + tbe.Sharers := false; + } + + action(s_deallocateTBE, "s", desc="Deallocate TBE") { + TBEs.deallocate(address); + unset_tbe(); + } + + action(z_stall, "z", desc="Stall") { + // empty + } + + action(zz_stallAndWaitRequestQueue, "\z", desc="...") { + stall_and_wait(requestQueue_in, address); + } + + action(m_decrementNumberOfMessages, "m", desc="Decrement the number of messages for which we're waiting") { + peek(responseToCache_in, ResponseMsg) { + assert(in_msg.Acks >= 0); + assert(is_valid(tbe)); + DPRINTF(RubySlicc, "Sender = %s\n", in_msg.Sender); + DPRINTF(RubySlicc, "SilentAcks = %d\n", in_msg.SilentAcks); + if (tbe.AppliedSilentAcks == false) { + tbe.NumPendingMsgs := tbe.NumPendingMsgs - in_msg.SilentAcks; + tbe.AppliedSilentAcks := true; + } + DPRINTF(RubySlicc, "%d\n", tbe.NumPendingMsgs); + tbe.NumPendingMsgs := tbe.NumPendingMsgs - in_msg.Acks; + DPRINTF(RubySlicc, "%d\n", tbe.NumPendingMsgs); + APPEND_TRANSITION_COMMENT(tbe.NumPendingMsgs); + APPEND_TRANSITION_COMMENT(in_msg.Sender); + tbe.LastResponder := in_msg.Sender; + if (tbe.InitialRequestTime != zero_time() && in_msg.InitialRequestTime != zero_time()) { + assert(tbe.InitialRequestTime == in_msg.InitialRequestTime); + } + if (in_msg.InitialRequestTime != zero_time()) { + tbe.InitialRequestTime := in_msg.InitialRequestTime; + } + if (tbe.ForwardRequestTime != zero_time() 
&& in_msg.ForwardRequestTime != zero_time()) { + assert(tbe.ForwardRequestTime == in_msg.ForwardRequestTime); + } + if (in_msg.ForwardRequestTime != zero_time()) { + tbe.ForwardRequestTime := in_msg.ForwardRequestTime; + } + if (tbe.FirstResponseTime == zero_time()) { + tbe.FirstResponseTime := curCycle(); + } + } + } + + action(o_checkForCompletion, "o", desc="Check if we have received all the messages required for completion") { + assert(is_valid(tbe)); + if (tbe.NumPendingMsgs == 0) { + enqueue(triggerQueue_out, TriggerMsg) { + out_msg.Addr := address; + if (tbe.Sharers) { + out_msg.Type := TriggerType:ALL_ACKS; + } else { + out_msg.Type := TriggerType:ALL_ACKS_NO_SHARERS; + } + } + } + } + + action(uo_updateCurrentOwner, "uo", desc="When moving SS state, update current owner.") { + peek(responseToCache_in, ResponseMsg) { + assert(is_valid(tbe)); + tbe.CurOwner := in_msg.Sender; + } + } + + action(p_decrementNumberOfMessagesByOne, "p", desc="Decrement the number of messages for which we're waiting by one") { + assert(is_valid(tbe)); + tbe.NumPendingMsgs := tbe.NumPendingMsgs - 1; + } + + action(pp_incrementNumberOfMessagesByOne, "\p", desc="Increment the number of messages for which we're waiting by one") { + assert(is_valid(tbe)); + tbe.NumPendingMsgs := tbe.NumPendingMsgs + 1; + } + + action(kd_wakeUpDependents, "kd", desc="wake-up dependents") { + wakeUpBuffers(address); + } + + action(ka_wakeUpAllDependents, "ka", desc="wake-up all dependents") { + wakeUpAllBuffers(); + } + + action(r_setSharerBit, "r", desc="We saw other sharers") { + assert(is_valid(tbe)); + tbe.Sharers := true; + } + + action(gm_sendUnblockM, "gm", desc="Send unblock to memory and indicate M/O/E state") { + enqueue(unblockNetwork_out, ResponseMsg, cache_response_latency) { + out_msg.Addr := address; + out_msg.Type := CoherenceResponseType:UNBLOCKM; + out_msg.Sender := machineID; + out_msg.Destination.broadcast(MachineType:BorderControlUnit); + out_msg.OriginalDestination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Unblock_Control; + } + } + + action(gs_sendUnblockS, "gs", desc="Send unblock to memory and indicate S state") { + enqueue(unblockNetwork_out, ResponseMsg, cache_response_latency) { + assert(is_valid(tbe)); + out_msg.Addr := address; + out_msg.Type := CoherenceResponseType:UNBLOCKS; + out_msg.Sender := machineID; + out_msg.CurOwner := tbe.CurOwner; + out_msg.Destination.broadcast(MachineType:BorderControlUnit); + out_msg.OriginalDestination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Unblock_Control; + } + } + + action(v_writeDataToCacheVerify, "v", desc="Write data to cache, assert it was same as before") { + peek(responseToCache_in, ResponseMsg) { + assert(is_valid(cache_entry)); + DPRINTF(RubySlicc, "Cached Data Block: %s, Msg Data Block: %s\n", + cache_entry.DataBlk, in_msg.DataBlk); + assert(cache_entry.DataBlk == in_msg.DataBlk); + cache_entry.DataBlk := in_msg.DataBlk; + cache_entry.Dirty := in_msg.Dirty || cache_entry.Dirty; + } + } + + action(q_sendDataFromTBEToCache, "q", desc="Send data from TBE to cache") { + peek(forwardToCache_in, RequestMsg) { + assert(in_msg.Requestor != machineID); + enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) { + assert(is_valid(tbe)); + out_msg.Addr := address; + out_msg.Type := CoherenceResponseType:DATA; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.Requestor); + DPRINTF(RubySlicc, "%s\n", out_msg.Destination); + out_msg.DataBlk := tbe.DataBlk; + 
out_msg.Dirty := tbe.Dirty; + if (in_msg.DirectedProbe) { + out_msg.Acks := machineCount(MachineType:L1Cache)+machineCount(MachineType:GPUL2Cache); + } else { + out_msg.Acks := 2; + } + out_msg.SilentAcks := in_msg.SilentAcks; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := in_msg.ForwardRequestTime; + } + } + } + + action(sq_sendSharedDataFromTBEToCache, "sq", desc="Send shared data from TBE to cache, still the owner") { + peek(forwardToCache_in, RequestMsg) { + assert(in_msg.Requestor != machineID); + enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) { + assert(is_valid(tbe)); + out_msg.Addr := address; + out_msg.Type := CoherenceResponseType:DATA_SHARED; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.Requestor); + DPRINTF(RubySlicc, "%s\n", out_msg.Destination); + out_msg.DataBlk := tbe.DataBlk; + out_msg.Dirty := tbe.Dirty; + if (in_msg.DirectedProbe) { + out_msg.Acks := machineCount(MachineType:L1Cache)+machineCount(MachineType:GPUL2Cache); + } else { + out_msg.Acks := 2; + } + out_msg.SilentAcks := in_msg.SilentAcks; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := in_msg.ForwardRequestTime; + } + } + } + + action(qm_sendDataFromTBEToCache, "qm", desc="Send data from TBE to cache, multiple sharers, still the owner") { + peek(forwardToCache_in, RequestMsg) { + enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) { + assert(is_valid(tbe)); + out_msg.Addr := address; + out_msg.Type := CoherenceResponseType:DATA_SHARED; + out_msg.Sender := machineID; + out_msg.Destination := in_msg.MergedRequestors; + DPRINTF(RubySlicc, "%s\n", out_msg.Destination); + out_msg.DataBlk := tbe.DataBlk; + out_msg.Dirty := tbe.Dirty; + out_msg.Acks := machineCount(MachineType:L1Cache)+machineCount(MachineType:GPUL2Cache); + out_msg.SilentAcks := in_msg.SilentAcks; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := in_msg.ForwardRequestTime; + } + } + } + + action(qq_sendDataFromTBEToMemory, "\q", desc="Send data from TBE to memory") { + enqueue(unblockNetwork_out, ResponseMsg, cache_response_latency) { + assert(is_valid(tbe)); + out_msg.Addr := address; + out_msg.Sender := machineID; + out_msg.Destination.broadcast(MachineType:BorderControlUnit); + out_msg.OriginalDestination.add(map_Address_to_Directory(address)); + out_msg.Dirty := tbe.Dirty; + if (tbe.Dirty) { + out_msg.Type := CoherenceResponseType:WB_DIRTY; + out_msg.DataBlk := tbe.DataBlk; + out_msg.MessageSize := MessageSizeType:Writeback_Data; + } else { + out_msg.Type := CoherenceResponseType:WB_CLEAN; + // NOTE: in a real system this would not send data. 
We send + // data here only so we can check it at the memory + out_msg.DataBlk := tbe.DataBlk; + out_msg.MessageSize := MessageSizeType:Writeback_Control; + } + } + } + + action(t_sendExclusiveDataFromTBEToMemory, "t", desc="Send exclusive data from TBE to memory") { + enqueue(unblockNetwork_out, ResponseMsg, cache_response_latency) { + assert(is_valid(tbe)); + out_msg.Addr := address; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.DataBlk := tbe.DataBlk; + out_msg.Dirty := tbe.Dirty; + if (tbe.Dirty) { + out_msg.Type := CoherenceResponseType:WB_EXCLUSIVE_DIRTY; + out_msg.DataBlk := tbe.DataBlk; + out_msg.MessageSize := MessageSizeType:Writeback_Data; + } else { + out_msg.Type := CoherenceResponseType:WB_EXCLUSIVE_CLEAN; + // NOTE: in a real system this would not send data. We send + // data here only so we can check it at the memory + out_msg.DataBlk := tbe.DataBlk; + out_msg.MessageSize := MessageSizeType:Writeback_Control; + } + } + } + + action(f_sendAck, "f", desc="Send ack from cache to requestor") { + peek(forwardToCache_in, RequestMsg) { + enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) { + out_msg.Addr := address; + out_msg.Type := CoherenceResponseType:ACK; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.Requestor); + out_msg.Acks := 1; + out_msg.SilentAcks := in_msg.SilentAcks; + assert(in_msg.DirectedProbe == false); + out_msg.MessageSize := MessageSizeType:Response_Control; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := in_msg.ForwardRequestTime; + } + } + } + + action(ff_sendAckShared, "\f", desc="Send shared ack from cache to requestor") { + peek(forwardToCache_in, RequestMsg) { + enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) { + out_msg.Addr := address; + out_msg.Type := CoherenceResponseType:ACK_SHARED; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.Requestor); + out_msg.Acks := 1; + out_msg.SilentAcks := in_msg.SilentAcks; + assert(in_msg.DirectedProbe == false); + out_msg.MessageSize := MessageSizeType:Response_Control; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := in_msg.ForwardRequestTime; + } + } + } + + action(g_sendUnblock, "g", desc="Send unblock to memory") { + enqueue(unblockNetwork_out, ResponseMsg, cache_response_latency) { + out_msg.Addr := address; + out_msg.Type := CoherenceResponseType:UNBLOCK; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Unblock_Control; + } + } + + action(e_sendData, "e", desc="Send data from cache to requestor") { + peek(forwardToCache_in, RequestMsg) { + enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) { + assert(is_valid(cache_entry)); + out_msg.Addr := address; + out_msg.Type := CoherenceResponseType:DATA; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.Requestor); + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.Dirty := cache_entry.Dirty; + if (in_msg.DirectedProbe) { + out_msg.Acks := machineCount(MachineType:L1Cache)+machineCount(MachineType:GPUL2Cache); + } else { + out_msg.Acks := 2; + } + out_msg.SilentAcks := in_msg.SilentAcks; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := in_msg.ForwardRequestTime; + } + } + } + + action(ee_sendDataShared, "\e", desc="Send data from cache to requestor, 
remaining the owner") { + peek(forwardToCache_in, RequestMsg) { + enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) { + assert(is_valid(cache_entry)); + out_msg.Addr := address; + out_msg.Type := CoherenceResponseType:DATA_SHARED; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.Requestor); + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.Dirty := cache_entry.Dirty; + DPRINTF(RubySlicc, "%s\n", out_msg.DataBlk); + if (in_msg.DirectedProbe) { + out_msg.Acks := machineCount(MachineType:L1Cache)+machineCount(MachineType:GPUL2Cache); + } else { + out_msg.Acks := 2; + } + out_msg.SilentAcks := in_msg.SilentAcks; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := in_msg.ForwardRequestTime; + } + } + } + + action(c_sendExclusiveData, "c", desc="Send exclusive data from cache to requestor") { + peek(forwardToCache_in, RequestMsg) { + enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) { + assert(is_valid(cache_entry)); + out_msg.Addr := address; + out_msg.Type := CoherenceResponseType:DATA_EXCLUSIVE; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.Requestor); + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.Dirty := cache_entry.Dirty; + if (in_msg.DirectedProbe) { + out_msg.Acks := machineCount(MachineType:L1Cache)+machineCount(MachineType:GPUL2Cache); + } else { + out_msg.Acks := 2; + } + out_msg.SilentAcks := in_msg.SilentAcks; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := in_msg.ForwardRequestTime; + } + } + } + + action(em_sendDataSharedMultiple, "em", desc="Send data from cache to all requestors, still the owner") { + peek(forwardToCache_in, RequestMsg) { + enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) { + assert(is_valid(cache_entry)); + out_msg.Addr := address; + out_msg.Type := CoherenceResponseType:DATA_SHARED; + out_msg.Sender := machineID; + out_msg.Destination := in_msg.MergedRequestors; + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.Dirty := cache_entry.Dirty; + DPRINTF(RubySlicc, "%s\n", out_msg.DataBlk); + out_msg.Acks := machineCount(MachineType:L1Cache)+machineCount(MachineType:GPUL2Cache); + out_msg.SilentAcks := in_msg.SilentAcks; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := in_msg.ForwardRequestTime; + } + } + } + + action(uu_profileWriteMiss, "uu", desc="...") { + ++L2cache.demand_misses; + } + + action(vv_profileReadMiss, "vv", desc="...") { + ++L2cache.demand_misses; + } + + // TRANSITIONS + + transition({IM, IS, OI, MI, II}, {Get, Store, Replacement, Get_Atom}) {} { + zz_stallAndWaitRequestQueue; + } + + transition({ISM,SM,OM,SS}, {Replacement,Store,Get_Atom}) {} { + zz_stallAndWaitRequestQueue; + } + + transition({M_W,MM_W}, {Replacement, Get_Atom}) {} { + zz_stallAndWaitRequestQueue; + } + + transition(I, Store, IM) {TagArrayRead, TagArrayWrite} { + ii_allocateL2CacheBlock; + i_allocateTBE; + es_recordRequestor; + b_issueGETX; + uu_profileWriteMiss; + rq_popL1IncomingQueue; + } + + transition({S,MM,O,M}, Get) {TagArrayRead, DataArrayRead} { + h_load_hit; + rq_popL1IncomingQueue; + } + + transition({SS,M_W,MM_W,SM,OM,ISM}, Get) {DataArrayRead} { + h_load_hit; + rq_popL1IncomingQueue; + } + + transition(MM, Store) {TagArrayRead, DataArrayWrite} { + hh_store_hit; + as_ackStore; + 
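// MM already holds write permission, so the partial store is applied + // locally and only a WB_ACK returns to the requesting L1. +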
rq_popL1IncomingQueue; + } + + transition(MM_W, Store) {DataArrayWrite} { + hh_store_hit; + as_ackStore; + rq_popL1IncomingQueue; + } + + transition(M, Store, MM) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + hh_store_hit; + as_ackStore; + rq_popL1IncomingQueue; + } + + transition(O, Store, OM) {TagArrayRead} { + i_allocateTBE; + es_recordRequestor; + b_issueGETX; + p_decrementNumberOfMessagesByOne; + uu_profileWriteMiss; + rq_popL1IncomingQueue; + } + + transition(S, Store, SM) {TagArrayRead} { + i_allocateTBE; + es_recordRequestor; + b_issueGETX; + uu_profileWriteMiss; + rq_popL1IncomingQueue; + } + + transition(I, Get, IS) {TagArrayRead} { + ii_allocateL2CacheBlock; + i_allocateTBE; + es_recordRequestor; + a_issueGETS; + vv_profileReadMiss; + rq_popL1IncomingQueue; + } + + // Let's deal with atomics + + transition(MM, Get_Atom, MM_A) { + i_allocateTBE; + es_recordRequestor; + ha_load_hit; + rq_popL1IncomingQueue; + } + + transition(I, Get_Atom, IM_A) { + ii_allocateL2CacheBlock; + i_allocateTBE; + es_recordRequestor; + b_issueGETX; + //uu_profileMiss; // TODO + rq_popL1IncomingQueue; + } + + transition(S, Get_Atom, SM_A) { + i_allocateTBE; + es_recordRequestor; + b_issueGETX; + //uu_profileMiss; // TODO + rq_popL1IncomingQueue; + } + + transition(M, Get_Atom, MM_A) { + i_allocateTBE; + es_recordRequestor; + h_load_hit; + rq_popL1IncomingQueue; + } + + transition(O, Get_Atom, OM_A) { + i_allocateTBE; + es_recordRequestor; + b_issueGETX; + p_decrementNumberOfMessagesByOne; + //uu_profileMiss; // TODO + rq_popL1IncomingQueue; + } + + transition(MM_A, Put_Atom, MM) { + sa_store_hit; + s_deallocateTBE; + aq_popL1AtomicQueue; + } + + transition(SM_A, {Data, Exclusive_Data}, SM_AA) { + v_writeDataToCacheVerify; + m_decrementNumberOfMessages; + o_checkForCompletion; + n_popResponseQueue; + } + + transition(IM_A, {Data, Exclusive_Data}, IM_AA) { + u_writeDataToCache; + m_decrementNumberOfMessages; + o_checkForCompletion; + n_popResponseQueue; + //kd_wakeUpDependents; // This is only for ex data, I don't think we need it + } + + transition(OM_A, {All_acks, All_acks_no_sharers}, MM_A) { + ha_load_hit; + gm_sendUnblockM; + j_popTriggerQueue; + kd_wakeUpDependents; + } + + transition(IM_AA, All_acks_no_sharers, MM_A) { + ha_load_hit; + gm_sendUnblockM; + j_popTriggerQueue; + kd_wakeUpDependents; + } + + transition(SM_AA, All_acks_no_sharers, MM_A) { + ha_load_hit; + gm_sendUnblockM; + j_popTriggerQueue; + kd_wakeUpDependents; + } + + transition(SM_AA, Ack) { + m_decrementNumberOfMessages; + o_checkForCompletion; + n_popResponseQueue; + } + + transition({IM_A,SM_A,OM_A,MM_A,SM_AA,IM_AA}, {Get,Get_Atom,Store,Replacement,Other_GETX,Other_GETS,Merged_GETS,NC_DMA_GETS,Invalidate}) { + z_stall; + } + + // Transitions for replacements + + transition(I, Replacement) {TagArrayRead} { + rr_deallocateL2CacheBlock; + ka_wakeUpAllDependents; + } + + transition(S, Replacement, I) {TagArrayRead, TagArrayWrite} { + rr_deallocateL2CacheBlock; + ka_wakeUpAllDependents; + } + + transition(O, Replacement, OI) {TagArrayRead} { + i_allocateTBE; + d_issuePUT; + rr_deallocateL2CacheBlock; + ka_wakeUpAllDependents; + } + + transition(O, Merged_GETS) {TagArrayRead, DataArrayRead} { + em_sendDataSharedMultiple; + l_popForwardQueue; + } + + transition({M,MM}, Replacement, MI) {TagArrayRead, DataArrayRead} { + i_allocateTBE; + d_issuePUT; + rr_deallocateL2CacheBlock; + ka_wakeUpAllDependents; + } + + transition(MI, Writeback_Ack, I) {TagArrayWrite} { + t_sendExclusiveDataFromTBEToMemory; + s_deallocateTBE; +
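// The writeback data travels from the TBE rather than the cache entry, + // because the entry was already deallocated when the PUT was issued. +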
l_popForwardQueue; + kd_wakeUpDependents; + } + + transition(OI, Writeback_Ack, I) {TagArrayWrite} { + qq_sendDataFromTBEToMemory; + s_deallocateTBE; + l_popForwardQueue; + kd_wakeUpDependents; + } + + transition({OI, MI}, {Other_GETX, Invalidate}, II) { + q_sendDataFromTBEToCache; + l_popForwardQueue; + } + + transition({OI, MI}, {NC_DMA_GETS, Other_GETS}, OI) { + sq_sendSharedDataFromTBEToCache; + l_popForwardQueue; + } + + transition({OI, MI}, Merged_GETS, OI) { + qm_sendDataFromTBEToCache; + l_popForwardQueue; + } + + // Transitions based on responses + + // Transitions from IS + + transition(IS, {Other_GETX, NC_DMA_GETS, Other_GETS, Invalidate}) { + f_sendAck; + l_popForwardQueue; + } + + transition(IS, Data, SS) {DataArrayWrite} { + u_writeDataToCache; + m_decrementNumberOfMessages; + o_checkForCompletion; + hx_external_load_hit; + uo_updateCurrentOwner; + n_popResponseQueue; + kd_wakeUpDependents; + } + + transition(IS, Exclusive_Data, M_W) {DataArrayWrite} { + u_writeDataToCache; + m_decrementNumberOfMessages; + o_checkForCompletion; + hx_external_load_hit; + n_popResponseQueue; + kd_wakeUpDependents; + } + + transition(IS, Shared_Data, SS) {DataArrayWrite} { + u_writeDataToCache; + r_setSharerBit; + m_decrementNumberOfMessages; + o_checkForCompletion; + hx_external_load_hit; + uo_updateCurrentOwner; + n_popResponseQueue; + kd_wakeUpDependents; + } + + transition(IS, Ack) { + m_decrementNumberOfMessages; + o_checkForCompletion; + n_popResponseQueue; + } + + transition(IS, Shared_Ack) { + m_decrementNumberOfMessages; + r_setSharerBit; + o_checkForCompletion; + n_popResponseQueue; + } + + // Transitions from M_W + transition(M_W, Store, MM_W) {DataArrayWrite} { + hh_store_hit; + as_ackStore; + rq_popL1IncomingQueue; + } + + transition(M_W, Ack) { + m_decrementNumberOfMessages; + o_checkForCompletion; + n_popResponseQueue; + } + + transition(M_W, All_acks_no_sharers, M) {TagArrayWrite} { + gm_sendUnblockM; + s_deallocateTBE; + j_popTriggerQueue; + kd_wakeUpDependents; + } + + // Transitions from MM_W + + transition(MM_W, Ack) { + m_decrementNumberOfMessages; + o_checkForCompletion; + n_popResponseQueue; + } + + transition(MM_W, All_acks_no_sharers, MM) {TagArrayWrite} { + gm_sendUnblockM; + s_deallocateTBE; + j_popTriggerQueue; + kd_wakeUpDependents; + } + + // Transitions from SM and IM + transition(SM, {NC_DMA_GETS, Other_GETS}) { + ff_sendAckShared; + l_popForwardQueue; + } + + transition(SM, {Other_GETX, Invalidate}, IM) { + f_sendAck; + l_popForwardQueue; + } + + transition(IM, {Other_GETX, NC_DMA_GETS, Other_GETS, Invalidate}) { + f_sendAck; + l_popForwardQueue; + } + + transition({IM, IM_A, IM_AA}, Ack) { + m_decrementNumberOfMessages; + o_checkForCompletion; + n_popResponseQueue; + } + + transition(IM, Data, ISM) {DataArrayWrite} { + u_writeDataToCache; + m_decrementNumberOfMessages; + o_checkForCompletion; + kd_wakeUpDependents; + n_popResponseQueue; + } + + transition(IM, Exclusive_Data, MM_W) {DataArrayWrite} { + u_writeDataToCache; + m_decrementNumberOfMessages; + o_checkForCompletion; + sx_external_store_hit; + aes_ackExternalStore; + n_popResponseQueue; + kd_wakeUpDependents; + } + + // Transitions from ISM + transition(ISM, Ack) { + m_decrementNumberOfMessages; + o_checkForCompletion; + n_popResponseQueue; + } + + transition(ISM, All_acks_no_sharers, MM) {DataArrayWrite, TagArrayWrite} { + sxt_trig_ext_store_hit; + aes_ackExternalStore; + gm_sendUnblockM; + s_deallocateTBE; + j_popTriggerQueue; + kd_wakeUpDependents; + } + + // Transitions from SS + transition(SS, Ack) { +
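// SS keeps collecting acks for the earlier GETS; o_checkForCompletion + // fires the All_acks* trigger once the pending count reaches zero. +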
+  transition(SS, Ack) {
+    m_decrementNumberOfMessages;
+    o_checkForCompletion;
+    n_popResponseQueue;
+  }
+
+  transition(SS, Shared_Ack) {
+    m_decrementNumberOfMessages;
+    r_setSharerBit;
+    o_checkForCompletion;
+    n_popResponseQueue;
+  }
+
+  transition(SS, All_acks, S) {TagArrayWrite} {
+    gs_sendUnblockS;
+    s_deallocateTBE;
+    j_popTriggerQueue;
+    kd_wakeUpDependents;
+  }
+
+  transition(SS, All_acks_no_sharers, S) {TagArrayWrite} {
+    // Note: The directory might still be the owner, so that is why we go to S
+    gs_sendUnblockS;
+    s_deallocateTBE;
+    j_popTriggerQueue;
+    kd_wakeUpDependents;
+  }
+
+  // Transitions from OM
+
+  transition(OM, {Other_GETX, Invalidate}, IM) {
+    e_sendData;
+    pp_incrementNumberOfMessagesByOne;
+    l_popForwardQueue;
+  }
+
+  transition(OM, {NC_DMA_GETS, Other_GETS}) {
+    ee_sendDataShared;
+    l_popForwardQueue;
+  }
+
+  transition(OM, Merged_GETS) {
+    em_sendDataSharedMultiple;
+    l_popForwardQueue;
+  }
+
+  transition({OM, OM_A}, Ack) {
+    m_decrementNumberOfMessages;
+    o_checkForCompletion;
+    n_popResponseQueue;
+  }
+
+  transition(OM, {All_acks, All_acks_no_sharers}, MM) {TagArrayWrite, DataArrayWrite} {
+    sxt_trig_ext_store_hit;
+    aes_ackExternalStore;
+    gm_sendUnblockM;
+    s_deallocateTBE;
+    j_popTriggerQueue;
+    kd_wakeUpDependents;
+  }
+
+  // Transitions from SM
+  transition({SM, SM_A}, Ack) {
+    m_decrementNumberOfMessages;
+    o_checkForCompletion;
+    n_popResponseQueue;
+  }
+
+  transition(SM, {Data, Exclusive_Data}, ISM) {
+    v_writeDataToCacheVerify;
+    m_decrementNumberOfMessages;
+    o_checkForCompletion;
+    n_popResponseQueue;
+  }
+
+  // Transitions for other cache requests
+
+  transition(I, {Other_GETX, NC_DMA_GETS, Other_GETS, Invalidate}) {TagArrayRead} {
+    f_sendAck;
+    l_popForwardQueue;
+  }
+
+  transition(S, {NC_DMA_GETS, Other_GETS}) {TagArrayRead} {
+    ff_sendAckShared;
+    l_popForwardQueue;
+  }
+
+  transition(S, {Other_GETX, Invalidate}, I) {TagArrayRead, TagArrayWrite} {
+    f_sendAck;
+    l_popForwardQueue;
+  }
+
+  transition(O, {Other_GETX, Invalidate}, I) {TagArrayRead, TagArrayWrite, DataArrayRead} {
+    e_sendData;
+    l_popForwardQueue;
+  }
+
+  transition(O, {NC_DMA_GETS, Other_GETS}) {TagArrayRead, DataArrayRead} {
+    ee_sendDataShared;
+    l_popForwardQueue;
+  }
+
+  transition(MM, {Other_GETX, Invalidate}, I) {TagArrayRead, TagArrayWrite, DataArrayRead} {
+    c_sendExclusiveData;
+    l_popForwardQueue;
+  }
+
+  transition(MM, Other_GETS, I) {TagArrayRead, TagArrayWrite, DataArrayRead} {
+    c_sendExclusiveData;
+    l_popForwardQueue;
+  }
+
+  transition(MM, NC_DMA_GETS, O) {TagArrayRead, TagArrayWrite, DataArrayRead} {
+    ee_sendDataShared;
+    l_popForwardQueue;
+  }
+
+  transition(MM, Merged_GETS, O) {TagArrayRead, TagArrayWrite, DataArrayRead} {
+    em_sendDataSharedMultiple;
+    l_popForwardQueue;
+  }
+
+  transition(M, {Other_GETX, Invalidate}, I) {TagArrayRead, TagArrayWrite, DataArrayRead} {
+    c_sendExclusiveData;
+    l_popForwardQueue;
+  }
+
+  transition(M, Other_GETS, O) {TagArrayRead, TagArrayWrite, DataArrayRead} {
+    ee_sendDataShared;
+    l_popForwardQueue;
+  }
+
+  transition(M, NC_DMA_GETS, O) {TagArrayRead, TagArrayWrite, DataArrayRead} {
+    ee_sendDataShared;
+    l_popForwardQueue;
+  }
+
+  transition(M, Merged_GETS, O) {TagArrayRead, TagArrayWrite, DataArrayRead} {
+    em_sendDataSharedMultiple;
+    l_popForwardQueue;
+  }
+
+  // Transitions from II
+  transition(II, {NC_DMA_GETS, Other_GETS, Other_GETX, Invalidate}, II) {
+    f_sendAck;
+    l_popForwardQueue;
+  }
+
+  transition(II, Writeback_Ack, I) {
+    g_sendUnblock;
+    s_deallocateTBE;
+    l_popForwardQueue;
+    kd_wakeUpDependents;
+  }
+
+  transition(II, Writeback_Nack, I) {
+    s_deallocateTBE;
+    l_popForwardQueue;
+    kd_wakeUpDependents;
+  }
+
+}
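The atomic path is the interesting addition in this controller: a Get_Atom allocates a TBE and parks the line in a transient *_A state, every conflicting event is stalled via z_stall, and only the matching Put_Atom (or the final ack) releases the block. A minimal table-driven sketch of that idea in Python; state, event, and action names are copied from the transitions above, but the lookup loop itself is illustrative, not how SLICC-generated C++ is structured:

    # Hypothetical sketch: a few GPU L2 atomic transitions as a lookup table.
    TRANSITIONS = {
        # (state, event): (next_state, actions)
        ("MM",   "Get_Atom"): ("MM_A", ["i_allocateTBE", "es_recordRequestor",
                                        "ha_load_hit", "rq_popL1IncomingQueue"]),
        ("I",    "Get_Atom"): ("IM_A", ["ii_allocateL2CacheBlock", "i_allocateTBE",
                                        "es_recordRequestor", "b_issueGETX",
                                        "rq_popL1IncomingQueue"]),
        ("MM_A", "Put_Atom"): ("MM",   ["sa_store_hit", "s_deallocateTBE",
                                        "aq_popL1AtomicQueue"]),
    }

    ATOMIC_STATES = {"IM_A", "SM_A", "OM_A", "MM_A", "SM_AA", "IM_AA"}

    def step(state, event):
        # Anything that touches a block mid-atomic stalls, cf. z_stall above.
        if (state, event) not in TRANSITIONS:
            if state in ATOMIC_STATES:
                return state, ["z_stall"]
            raise KeyError((state, event))
        return TRANSITIONS[(state, event)]

    print(step("MM", "Get_Atom"))   # -> ('MM_A', [..., 'ha_load_hit', ...])
    print(step("MM_A", "Store"))    # -> ('MM_A', ['z_stall'])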
diff -r 1b6fae7cb423 -r 4c279b99f8f8 src/mem/protocol/VI_hammer_bcu.slicc
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/src/mem/protocol/VI_hammer_bcu.slicc  Wed Dec 02 17:08:49 2015 -0600
@@ -0,0 +1,15 @@
+protocol "VI_hammer_bcu";
+
+include "RubySlicc_interfaces.slicc";
+
+include "VI_hammer-msg.sm";
+include "VI_hammer-CPUCache.sm";
+
+include "VI_hammer-GPUL1cache.sm";
+include "VI_hammer_bcu-GPUL2cache.sm";
+include "VI-ce.sm";
+
+include "VI_hammer-dir.sm";
+include "VI_hammer-dma.sm";
+
+include "MOESI_hammer_bcu-BCU.sm";
# HG changeset patch
# User Lena Olson
# Date 1449097729 21600
# Node ID ea23ae7dc3ff9da72b844b225d5cf7577b1ee328
# Parent  4c279b99f8f872610c293d4581e2849ccbae9827
Update VI_hammer and BCU so that their sequencers can't restore

diff -r 4c279b99f8f8 -r ea23ae7dc3ff configs/gpu_protocol/VI_hammer_bcu_fusion.py
--- a/configs/gpu_protocol/VI_hammer_bcu_fusion.py  Wed Dec 02 17:08:49 2015 -0600
+++ b/configs/gpu_protocol/VI_hammer_bcu_fusion.py  Wed Dec 02 17:08:49 2015 -0600
@@ -108,7 +108,8 @@
                            max_outstanding_requests = options.gpu_l1_buf_depth,
                            ruby_system = ruby_system,
                            deadlock_threshold = 2000000,
-                           connect_to_io = False)
+                           connect_to_io = False,
+                           can_restore = False)
 
     l1_cntrl.sequencer = gpu_seq
@@ -216,7 +217,8 @@
                            max_outstanding_requests = options.gpu_l1_buf_depth,
                            ruby_system = ruby_system,
                            deadlock_threshold = 2000000,
-                           connect_to_io = False)
+                           connect_to_io = False,
+                           can_restore = False)
 
     l1_cntrl.sequencer = cpu_seq
@@ -250,7 +252,8 @@
                                max_outstanding_requests = 64,
                                support_inst_reqs = False,
                                ruby_system = ruby_system,
-                               connect_to_io = False)
+                               connect_to_io = False,
+                               can_restore = False)
 
     gpu_ce_cntrl = GPUCopyDMA_Controller(version = 0,
                                          sequencer = gpu_ce_seq,
diff -r 4c279b99f8f8 -r ea23ae7dc3ff configs/gpu_protocol/VI_hammer_fusion.py
--- a/configs/gpu_protocol/VI_hammer_fusion.py  Wed Dec 02 17:08:49 2015 -0600
+++ b/configs/gpu_protocol/VI_hammer_fusion.py  Wed Dec 02 17:08:49 2015 -0600
@@ -108,7 +108,8 @@
                            max_outstanding_requests = options.gpu_l1_buf_depth,
                            ruby_system = ruby_system,
                            deadlock_threshold = 2000000,
-                           connect_to_io = False)
+                           connect_to_io = False,
+                           can_restore = False)
 
     l1_cntrl.sequencer = gpu_seq
@@ -216,7 +217,8 @@
                            max_outstanding_requests = options.gpu_l1_buf_depth,
                            ruby_system = ruby_system,
                            deadlock_threshold = 2000000,
-                           connect_to_io = False)
+                           connect_to_io = False,
+                           can_restore = False)
 
     l1_cntrl.sequencer = cpu_seq
@@ -250,7 +252,8 @@
                                max_outstanding_requests = 64,
                                support_inst_reqs = False,
                                ruby_system = ruby_system,
-                               connect_to_io = False)
+                               connect_to_io = False,
+                               can_restore = False)
 
     gpu_ce_cntrl = GPUCopyDMA_Controller(version = 0,
                                          sequencer = gpu_ce_seq,
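Each hunk in the changeset above makes the same mechanical edit: every GPU-side RubySequencer (GPU L1, CPU-side pagewalk, and copy engine) is now constructed with can_restore = False, so checkpoint restore will not try to rebuild in-flight sequencer state. A hedged Python sketch of the shared keyword pattern; the helper itself is hypothetical, while the parameter names are the ones visible in the diffs above:

    # Illustrative only: the keyword-argument set this changeset standardizes on.
    def gpu_sequencer_kwargs(options, ruby_system):
        return dict(max_outstanding_requests = options.gpu_l1_buf_depth,
                    ruby_system = ruby_system,
                    deadlock_threshold = 2000000,
                    connect_to_io = False,
                    can_restore = False)   # new: exclude from checkpoint restore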
parser.add_option("--pw_l2_latency", type="int", default=0, help="if 0, we get the value from elsewhere") + parser.add_option("--mmu_latency", type="int", default=20, help="") + parser.add_option("--l2_tlb_entries", type="int", default=0, help="") + def configureMemorySpaces(options): total_mem_range = AddrRange(options.total_mem_size) cpu_mem_range = total_mem_range @@ -189,6 +195,7 @@ for sc in gpu.shader_cores: sc.lsq = ShaderLSQ() sc.lsq.data_tlb.entries = options.gpu_tlb_entries + sc.lsq.data_tlb.associativity = options.gpu_tlb_assoc sc.lsq.forward_flush = (buildEnv['PROTOCOL'] == 'VI_hammer_fusion' \ and options.flush_kernel_end) sc.lsq.warp_size = options.gpu_warp_size @@ -231,10 +238,13 @@ # Initialize the MMU, connecting it to either the pagewalk cache port for # unified address space, or the copy engine's host-side sequencer port for # split address space architectures. - gpu.shader_mmu.setUpPagewalkers(32, + gpu.shader_mmu.setUpPagewalkers(options.concurrent_walks, ruby._cpu_ports[options.num_cpus+options.num_sc].slave, options.gpu_tlb_bypass_l1) + gpu.shader_mmu.latency = options.mmu_latency + gpu.shader_mmu.l2_tlb_entries = options.l2_tlb_entries + if options.split: # NOTE: In split address space architectures, the MMU only provides the # copy engine host-side TLB access to a page walker. This should diff -r ea23ae7dc3ff -r f3b35cc4f72d configs/fs_fusion.py --- a/configs/fs_fusion.py Wed Dec 02 17:08:49 2015 -0600 +++ b/configs/fs_fusion.py Wed Dec 02 17:08:49 2015 -0600 @@ -64,6 +64,8 @@ # Ruby.define_options(parser) +parser.add_option("--flush_tick", default=0, type="int") + (options, args) = parser.parse_args() options.ruby = True @@ -141,8 +143,9 @@ system.gpu_physmem = SimpleMemory(range = gpu_mem_range) system.gpu_physmem.port = system.iobus.master -system.gpu.test_tlb_shootdown = True -system.gpu.tlb_shootdown_tick = 5461844154685 + (29247313 / 4) +if options.flush_tick: + system.gpu.test_tlb_shootdown = True + system.gpu.tlb_shootdown_tick = options.flush_tick # # Setup Ruby diff -r ea23ae7dc3ff -r f3b35cc4f72d configs/gpu_config/gpgpusim.config.template --- a/configs/gpu_config/gpgpusim.config.template Wed Dec 02 17:08:49 2015 -0600 +++ b/configs/gpu_config/gpgpusim.config.template Wed Dec 02 17:08:49 2015 -0600 @@ -1,3 +1,5 @@ +-gpgpu_deadlock_detect 0 + # functional simulator specification -gpgpu_ptx_instruction_classification 0 -gpgpu_ptx_sim_mode 0 diff -r ea23ae7dc3ff -r f3b35cc4f72d configs/gpu_protocol/VI_hammer_bcu_fusion.py --- a/configs/gpu_protocol/VI_hammer_bcu_fusion.py Wed Dec 02 17:08:49 2015 -0600 +++ b/configs/gpu_protocol/VI_hammer_bcu_fusion.py Wed Dec 02 17:08:49 2015 -0600 @@ -182,7 +182,7 @@ assoc = 16, # 64 is fully associative @ 8kB replacement_policy = "LRU", start_index_bit = block_size_bits, - latency = 8, + latency = options.pwc_latency, resourceStalls = False) # Small cache since CPU L1 requires I and D pwi_cache = L1Cache(size = "512B", @@ -198,13 +198,17 @@ start_index_bit = block_size_bits, latency = 1, resourceStalls = False) + if (options.pw_l2_latency == 0): + m_issue_latency = l1_to_l2_noc_latency + else: + m_issue_latency = options.pw_l2_latency l1_cntrl = L1Cache_Controller(version = options.num_cpus, L1Icache = pwi_cache, L1Dcache = pwd_cache, L2cache = l2_cache, send_evictions = False, - issue_latency = l1_to_l2_noc_latency, + issue_latency = m_issue_latency, #this should be pwc_latency cache_response_latency = 1, l2_cache_hit_latency = 1, number_of_TBEs = options.gpu_l1_buf_depth, diff -r ea23ae7dc3ff -r f3b35cc4f72d 
diff -r ea23ae7dc3ff -r f3b35cc4f72d configs/gpu_protocol/VI_hammer_fusion.py
--- a/configs/gpu_protocol/VI_hammer_fusion.py  Wed Dec 02 17:08:49 2015 -0600
+++ b/configs/gpu_protocol/VI_hammer_fusion.py  Wed Dec 02 17:08:49 2015 -0600
@@ -176,20 +176,21 @@
     # Pagewalk cache
     # NOTE: We use a CPU L1 cache controller here. This is to facilatate MMU
     #       cache coherence (as the GPU L1 caches are incoherent without flushes
-    #       The L2 cache is small, and should have minimal affect on the 
+    #       The L2 cache is small, and should have minimal affect on the
     #       performance (see Section 6.2 of Power et al. HPCA 2014).
     pwd_cache = L1Cache(size = options.pwc_size,
                         assoc = 16, # 64 is fully associative @ 8kB
                         replacement_policy = "LRU",
                         start_index_bit = block_size_bits,
-                        latency = 8,
+                        latency = options.pwc_latency,
                         resourceStalls = False)
     # Small cache since CPU L1 requires I and D
     pwi_cache = L1Cache(size = "512B",
                         assoc = 2,
                         replacement_policy = "LRU",
                         start_index_bit = block_size_bits,
-                        latency = 8,
+
+                        latency = 8,
                         resourceStalls = False)
@@ -198,13 +199,17 @@
                       start_index_bit = block_size_bits,
                       latency = 1,
                       resourceStalls = False)
+    if (options.pw_l2_latency == 0):
+        m_issue_latency = l1_to_l2_noc_latency
+    else:
+        m_issue_latency = options.pw_l2_latency
 
     l1_cntrl = L1Cache_Controller(version = options.num_cpus,
                                   L1Icache = pwi_cache,
                                   L1Dcache = pwd_cache,
                                   L2cache = l2_cache,
                                   send_evictions = False,
-                                  issue_latency = l1_to_l2_noc_latency,
+                                  issue_latency = m_issue_latency, #this should be pwc_latency
                                   cache_response_latency = 1,
                                   l2_cache_hit_latency = 1,
                                   number_of_TBEs = options.gpu_l1_buf_depth,
diff -r ea23ae7dc3ff -r f3b35cc4f72d src/gpu/gpgpu-sim/cuda_gpu.cc
--- a/src/gpu/gpgpu-sim/cuda_gpu.cc  Wed Dec 02 17:08:49 2015 -0600
+++ b/src/gpu/gpgpu-sim/cuda_gpu.cc  Wed Dec 02 17:08:49 2015 -0600
@@ -751,11 +751,18 @@
         .name(name() + ".kernels_completed")
         .desc("Number of kernels completed")
         ;
+
+    shootdownTimes
+        .name(name() + ".shootdown_times")
+        .desc("Times to shootdown")
+        .init(8)
+        ;
 }
 
 void CudaGPU::TLBShootdownEvent::process()
 {
     DPRINTF(CudaGPU, "Processing shootdown!\n");
+    assert(gpu->running);
     switch(stage) {
       case Stage::Idle:
@@ -765,6 +772,7 @@
         }
         stage = Stage::Pausing;
         gpu->schedule(this, gpu->nextCycle());
+        gpu->shootdownStartTick = curTick();
         break;
       case Stage::Pausing:
         DPRINTF(CudaGPU, "Shootdown: Flushing cores\n");
@@ -797,6 +805,7 @@
         }
         stage = Stage::Idle;
         // NO need to schedule anything
+        gpu->shootdownTimes.sample(curTick()-gpu->shootdownStartTick);
         break;
       default:
         panic("Unexpected current shootdown stage");
diff -r ea23ae7dc3ff -r f3b35cc4f72d src/gpu/gpgpu-sim/cuda_gpu.hh
--- a/src/gpu/gpgpu-sim/cuda_gpu.hh  Wed Dec 02 17:08:49 2015 -0600
+++ b/src/gpu/gpgpu-sim/cuda_gpu.hh  Wed Dec 02 17:08:49 2015 -0600
@@ -347,6 +347,7 @@
 
     bool testShootdown;
     Tick shootdownTick;
+    Tick shootdownStartTick;
 
   public:
     /// Constructor
@@ -485,6 +486,7 @@
 
     /// Statistics for this GPU
     Stats::Scalar numKernelsCompleted;
+    Stats::Histogram shootdownTimes;
 
     void regStats();
 };
diff -r ea23ae7dc3ff -r f3b35cc4f72d src/gpu/shader_tlb.cc
--- a/src/gpu/shader_tlb.cc  Wed Dec 02 17:08:49 2015 -0600
+++ b/src/gpu/shader_tlb.cc  Wed Dec 02 17:08:49 2015 -0600
@@ -197,8 +197,8 @@
         return;
     }
     int way = (vpn / TheISA::PageBytes) % ways;
-    GPUTlbEntry* entry = NULL;
-    Tick minTick = curTick();
+    GPUTlbEntry* entry = &entries[way][0];
+    Tick minTick = entries[way][0].mruTick;
     for (int i=0; i < sets; i++) {
         if (entries[way][i].free) {
            entry = &entries[way][i];
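The shader_tlb.cc hunk above is a genuine bug fix, not a tuning tweak: seeding the victim search with entry = NULL and minTick = curTick() meant that a set whose entries had all been touched during the current tick produced no victim at all, while seeding with the first entry of the set guarantees one. The same pattern in a small Python sketch; the free/mruTick fields come from the diff, the surrounding scaffolding is illustrative:

    def pick_victim(way_entries):
        # way_entries: the candidate entries of one set, each with
        # .free and .mru_tick. Seed with the first candidate so a victim
        # always exists, matching the shader_tlb.cc fix above.
        victim, min_tick = way_entries[0], way_entries[0].mru_tick
        for entry in way_entries:
            if entry.free:
                return entry                  # a free slot beats any LRU victim
            if entry.mru_tick < min_tick:
                victim, min_tick = entry, entry.mru_tick
        return victim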
diff -r ea23ae7dc3ff -r f3b35cc4f72d src/mem/protocol/MOESI_hammer-GPUcache.sm
--- a/src/mem/protocol/MOESI_hammer-GPUcache.sm  Wed Dec 02 17:08:49 2015 -0600
+++ b/src/mem/protocol/MOESI_hammer-GPUcache.sm  Wed Dec 02 17:08:49 2015 -0600
@@ -155,6 +155,7 @@
     DataBlock DataBlk, desc="data for the block";
     bool FromL2, default="false", desc="block just moved from L2";
     bool AtomicAccessed, default="false", desc="block just moved from L2";
+    Address VAddr, desc="Virtual address associated with block";
   }
 
   // TBE fields
@@ -167,6 +168,7 @@
     bool AppliedSilentAcks, default="false", desc="for full-bit dir, does the pending msg count reflect the silent acks";
     MachineID LastResponder, desc="last machine to send a response for this request";
     MachineID CurOwner, desc="current owner of the block, used for UnblockS responses";
+    Address VAddr, desc="Virtual address associated with block";
     Cycles InitialRequestTime, default="Cycles(0)",
            desc="time the initial requests was sent from the L1Cache";
@@ -977,6 +979,7 @@
       tbe.DataBlk := cache_entry.DataBlk;  // Data only used for writebacks
       tbe.Dirty := cache_entry.Dirty;
       tbe.Sharers := false;
+      tbe.VAddr := cache_entry.VAddr;
     }
 
   action(it_allocateTBE, "it", desc="Allocate TBE") {
@@ -1004,6 +1007,7 @@
     assert(is_valid(tbe));
     cache_entry.Dirty := tbe.Dirty;
     cache_entry.DataBlk := tbe.DataBlk;
+    cache_entry.VAddr := tbe.VAddr;
   }
 
   action(nb_copyFromTBEToL1, "fu", desc="Copy data from TBE to L1 cache entry.") {
@@ -1012,6 +1016,7 @@
     cache_entry.Dirty := tbe.Dirty;
     cache_entry.DataBlk := tbe.DataBlk;
     cache_entry.FromL2 := true;
+    cache_entry.VAddr := tbe.VAddr;
   }
 
   action(m_decrementNumberOfMessages, "m", desc="Decrement the number of messages for which we're waiting") {
@@ -1102,6 +1107,7 @@
         DPRINTF(RubySlicc, "%s\n", out_msg.Destination);
         out_msg.DataBlk := tbe.DataBlk;
         out_msg.Dirty := tbe.Dirty;
+        out_msg.VAddr := tbe.VAddr;
         if (in_msg.DirectedProbe) {
           out_msg.Acks := machineCount(MachineType:L1Cache);
         } else {
@@ -1128,6 +1134,7 @@
         DPRINTF(RubySlicc, "%s\n", out_msg.Destination);
         out_msg.DataBlk := tbe.DataBlk;
         out_msg.Dirty := tbe.Dirty;
+        out_msg.VAddr := tbe.VAddr;
         if (in_msg.DirectedProbe) {
           out_msg.Acks := machineCount(MachineType:L1Cache);
         } else {
@@ -1153,6 +1160,7 @@
         DPRINTF(RubySlicc, "%s\n", out_msg.Destination);
         out_msg.DataBlk := tbe.DataBlk;
         out_msg.Dirty := tbe.Dirty;
+        out_msg.VAddr := tbe.VAddr;
         out_msg.Acks := machineCount(MachineType:L1Cache);
         out_msg.SilentAcks := in_msg.SilentAcks;
         out_msg.MessageSize := MessageSizeType:Response_Data;
@@ -1174,6 +1182,7 @@
       }
       out_msg.OriginalDestination.add(map_Address_to_Directory(address));
       out_msg.Dirty := tbe.Dirty;
+      out_msg.VAddr := tbe.VAddr;
       if (tbe.Dirty) {
         out_msg.Type := CoherenceResponseType:WB_DIRTY;
         out_msg.DataBlk := tbe.DataBlk;
@@ -1206,6 +1215,7 @@
       out_msg.Destination.add(map_Address_to_Directory(address));
       out_msg.DataBlk := tbe.DataBlk;
       out_msg.Dirty := tbe.Dirty;
+      out_msg.VAddr := tbe.VAddr;
       if (tbe.Dirty) {
         out_msg.Type := CoherenceResponseType:WB_EXCLUSIVE_DIRTY;
         out_msg.DataBlk := tbe.DataBlk;
@@ -1270,12 +1280,18 @@
   action(ii_allocateL1DCacheBlock, "\i", desc="Set L1 D-cache tag equal to tag of block B.") {
     if (is_invalid(cache_entry)) {
       set_cache_entry(L1Dcache.allocate(address, new Entry));
+      peek(mandatoryQueue_in, RubyRequest){
+        cache_entry.VAddr := in_msg.VirtualAddress;
+      }
     }
   }
 
   action(jj_allocateL1ICacheBlock, "\j", desc="Set L1 I-cache tag equal to tag of block B.") {
     if (is_invalid(cache_entry)) {
       set_cache_entry(L1Icache.allocate(address, new Entry));
+      peek(mandatoryQueue_in, RubyRequest) {
+        cache_entry.VAddr := in_msg.VirtualAddress;
+      }
     }
   }
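The VAddr plumbing above follows one rule: the virtual address is captured exactly once, when the block is allocated from the mandatory-queue request, and is then copied entry to TBE to outgoing message so that writebacks and probe responses can name the virtual page they concern. Restated as a compact Python sketch (the classes are stand-ins for the SLICC structures; only the VAddr field mirrors the diff):

    class CacheEntry:
        def __init__(self): self.vaddr = None

    class TBE:
        def __init__(self): self.vaddr = None

    def allocate_block(entry, mandatory_req):
        entry.vaddr = mandatory_req["virtual_address"]  # ii_/jj_allocate* above

    def allocate_tbe(tbe, entry):
        tbe.vaddr = entry.vaddr                         # i_allocateTBE above

    def build_writeback_msg(tbe):
        return {"VAddr": tbe.vaddr}                     # out_msg.VAddr := tbe.VAddr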
diff -r ea23ae7dc3ff -r f3b35cc4f72d src/mem/protocol/MOESI_hammer_bcu-BCU.sm
--- a/src/mem/protocol/MOESI_hammer_bcu-BCU.sm  Wed Dec 02 17:08:49 2015 -0600
+++ b/src/mem/protocol/MOESI_hammer_bcu-BCU.sm  Wed Dec 02 17:08:49 2015 -0600
@@ -34,6 +34,9 @@
 
 structure (PermissionTable, external="yes") {
   void logAddress(Address, bool);
+  void logAddressRead(Address);
+  void logAddressData(Address);
+  void logAddressCoherence(Address);
   bool checkPLB(Address, bool);
   bool checkTable(Address, bool);
 }
@@ -131,6 +134,7 @@
         /*Do we need read permission*/
         if (in_msg.Type == CoherenceResponseType:ACK_SHARED){
           perm_table.logAddress(in_msg.Addr, false);
+          perm_table.logAddressCoherence(in_msg.Addr);
           bool ret := perm_table.checkPLB(in_msg.Addr, false);
           if (ret != true) {
             perm_table.checkTable(in_msg.Addr, false);
@@ -144,7 +148,8 @@
           /*With this coherence protocol, hard to tell if this was an innocent
             read that got O or what. Use dirty bit.*/
           perm_table.logAddress(in_msg.Addr, true);
-          bool ret := perm_table.checkPLB(in_msg.Addr, in_msg.Dirty);
+          perm_table.logAddressData(in_msg.Addr);
+          bool ret := perm_table.checkPLB(in_msg.Addr, true);
           if (ret != true) {
             perm_table.checkTable(in_msg.Addr, true);
             latency := miss_latency;
@@ -179,6 +184,7 @@
             in_msg.Type == CoherenceResponseType:UNBLOCKM ||
             in_msg.Type == CoherenceResponseType:WB_EXCLUSIVE_CLEAN){
           perm_table.logAddress(in_msg.Addr, false);
+          perm_table.logAddressCoherence(in_msg.Addr);
           bool ret := perm_table.checkPLB(in_msg.Addr, false);
           if (ret != true) {
             perm_table.checkTable(in_msg.Addr, false);
@@ -189,6 +195,7 @@
         else if (in_msg.Type == CoherenceResponseType:WB_DIRTY ||
                  in_msg.Type == CoherenceResponseType:WB_EXCLUSIVE_DIRTY){
           perm_table.logAddress(in_msg.Addr, true);
+          perm_table.logAddressData(in_msg.Addr);
           bool ret := perm_table.checkPLB(in_msg.Addr, true);
           if (ret != true) {
             perm_table.checkTable(in_msg.Addr, true);
@@ -215,29 +222,31 @@
   action(rqc_reqtodir, "rqc", desc="RequestFromCache") {
     peek(requestFromCache_in, RequestMsg) {
       DPRINTF(RubySlicc, "Got req to addr %s\n", in_msg.Addr);
-      Cycles latency := hit_latency;
-      /*Do we need read permission*/
-      if (in_msg.Type == CoherenceRequestType:GETX ||
-          in_msg.Type == CoherenceRequestType:GETS ||
-          in_msg.Type == CoherenceRequestType:MERGED_GETS ||
-          in_msg.Type == CoherenceRequestType:GETF){
-        perm_table.logAddress(in_msg.Addr, false);
-        bool ret := perm_table.checkPLB(in_msg.Addr, false);
-        if (ret != true) {
-          perm_table.checkTable(in_msg.Addr, false);
-          latency := miss_latency;
+        Cycles latency := hit_latency;
+        /*Do we need read permission*/
+        if (in_msg.Type == CoherenceRequestType:GETX ||
+            in_msg.Type == CoherenceRequestType:GETS ||
+            in_msg.Type == CoherenceRequestType:MERGED_GETS ||
+            in_msg.Type == CoherenceRequestType:GETF){
+          perm_table.logAddress(in_msg.Addr, false);
+          perm_table.logAddressRead(in_msg.Addr);
+          bool ret := perm_table.checkPLB(in_msg.Addr, false);
+          if (ret != true) {
+            perm_table.checkTable(in_msg.Addr, false);
+            latency := miss_latency;
+          }
         }
-      }
-      /*Do we need write permission*/
-      else if (in_msg.Type == CoherenceRequestType:PUT ||
-               in_msg.Type == CoherenceRequestType:PUTF){
-        perm_table.logAddress(in_msg.Addr, true);
-        bool ret := perm_table.checkPLB(in_msg.Addr, true);
-        if (ret != true) {
-          perm_table.checkTable(in_msg.Addr, true);
-          latency := miss_latency;
+        /*Do we need write permission*/
+        else if (in_msg.Type == CoherenceRequestType:PUT ||
+                 in_msg.Type == CoherenceRequestType:PUTF){
+          perm_table.logAddress(in_msg.Addr, true);
+          perm_table.logAddressCoherence(in_msg.Addr);
+          bool ret := perm_table.checkPLB(in_msg.Addr, true);
+          if (ret != true) {
+            perm_table.checkTable(in_msg.Addr, true);
+            latency := miss_latency;
+          }
         }
-      }
       enqueue(requestNetwork_out, RequestMsg, latency) {
         out_msg.Addr := in_msg.Addr;
         out_msg.Type := in_msg.Type;
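The BCU changes above split the single logAddress() profile point three ways so the new stats can separate demand-read requests, messages that actually carry writeback data, and the remaining coherence traffic. A sketch of that classification follows; the type names are taken from the SLICC above, but the mapping is only partly visible in these hunks, so treat it as illustrative rather than exhaustive:

    READ_TYPES      = {"GETX", "GETS", "MERGED_GETS", "GETF"}
    DATA_TYPES      = {"WB_DIRTY", "WB_EXCLUSIVE_DIRTY"}
    COHERENCE_TYPES = {"ACK_SHARED", "UNBLOCKM", "WB_EXCLUSIVE_CLEAN",
                       "PUT", "PUTF"}   # not exhaustive

    counters = {"read_message": 0, "data_message": 0, "coherence_message": 0}

    def classify(msg_type):
        # Mirrors logAddressRead / logAddressData / logAddressCoherence above.
        if msg_type in READ_TYPES:
            counters["read_message"] += 1
        elif msg_type in DATA_TYPES:
            counters["data_message"] += 1
        elif msg_type in COHERENCE_TYPES:
            counters["coherence_message"] += 1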
diff -r ea23ae7dc3ff -r f3b35cc4f72d src/mem/protocol/MOESI_hammer_bcu-msg.sm
--- a/src/mem/protocol/MOESI_hammer_bcu-msg.sm  Wed Dec 02 17:08:49 2015 -0600
+++ b/src/mem/protocol/MOESI_hammer_bcu-msg.sm  Wed Dec 02 17:08:49 2015 -0600
@@ -95,6 +95,7 @@
   NetDest OriginalDestination, desc="Multicast destination mask";
   MessageSizeType MessageSize, desc="size category of the message";
   bool DirectedProbe, default="false", desc="probe filter directed probe";
+  Address VAddr, desc="Virtual address for this request";
   Cycles InitialRequestTime, default="Cycles(0)",
          desc="time the initial requests was sent from the L1Cache";
@@ -125,6 +126,7 @@
   bool Dirty, desc="Is the data dirty (different than memory)?";
   int Acks, default="0", desc="How many messages this counts as";
   MessageSizeType MessageSize, desc="size category of the message";
+  Address VAddr, desc="Virtual address for this request";
   Cycles InitialRequestTime, default="Cycles(0)",
          desc="time the initial requests was sent from the L1Cache";
diff -r ea23ae7dc3ff -r f3b35cc4f72d src/mem/ruby/PermissionTable.cc
--- a/src/mem/ruby/PermissionTable.cc  Wed Dec 02 17:08:49 2015 -0600
+++ b/src/mem/ruby/PermissionTable.cc  Wed Dec 02 17:08:49 2015 -0600
@@ -55,6 +55,22 @@
     DPRINTF(PermissionTable, "Addr %#x, %s\n", addr.getAddress(), isWrite ? "W" : "R");
 }
 
+void PermissionTable::logAddressData(Address addr)
+{
+    m_data_message++;
+}
+
+
+void PermissionTable::logAddressRead(Address addr)
+{
+    m_read_message++;
+}
+
+void PermissionTable::logAddressCoherence(Address addr)
+{
+    m_coherence_message++;
+}
+
 /* After ATS translates address, insert it into permission table & cache.
  * Should only be called from ATS (shaderMMU)
  * Returns true if anything was inserted, since then a memory request
@@ -74,10 +90,8 @@
         //not in PLB - insert it
         hasWrite ? m_plb_write_insert_miss++ : m_plb_read_insert_miss++;
         if (plb_size > 0){
-            //update with this entry
-            entry.first = plb_tag;
-            //initialize vector
-            entry.second = std::vector<bool>(plb_entry_size * 2, false);
+            m_table_read++;
+            entry = getTableBlock(plb_tag);
 
             entry.second[offset] = true;
             entry.second[offset + 1] = hasWrite;
@@ -144,6 +158,20 @@
     return true;
 }
 
+/* Flush the PermissionTable and PLB */
+void PermissionTable::flush(){
+    m_plb_flushes++;
+
+    //Clear the table
+    std::fill(permission_bitmap.begin(), permission_bitmap.end(), false);
+
+    //Clear the PLB
+    m_plb_flushed_blocks += plb.size();
+    plb.clear();
+
+}
+
 /*Helper functions for dealing with PLB*/
 /* requires pre-shifted address (tag) */
 PermissionTable::plb_entry PermissionTable::popPLBEntry(Address plb_tag){
@@ -324,6 +352,31 @@
         .name(name() + ".table_write")
         .desc("Number of permission table writes")
         ;
+
+    m_data_message
+        .name(name() + ".data_message")
+        .desc("Number of messages carrying WB data")
+        ;
+
+    m_read_message
+        .name(name() + ".read_message")
+        .desc("Number of messages carrying requests for data")
+        ;
+
+    m_coherence_message
+        .name(name() + ".coherence_message")
+        .desc("Number of other coherence messages")
+        ;
+
+    m_plb_flushes
+        .name(name() + ".num_plb_flushes")
+        .desc("Number of PLB flushes")
+        ;
+
+    m_plb_flushed_blocks
+        .name(name() + ".num_plb_flushed_blocks")
+        .desc("Number of flushed PLB blocks")
+        ;
 }
 
 PermissionTable *
diff -r ea23ae7dc3ff -r f3b35cc4f72d src/mem/ruby/PermissionTable.hh
--- a/src/mem/ruby/PermissionTable.hh  Wed Dec 02 17:08:49 2015 -0600
+++ b/src/mem/ruby/PermissionTable.hh  Wed Dec 02 17:08:49 2015 -0600
@@ -48,11 +48,16 @@
     PermissionTable(const Params *p);
 
     void logAddress(Address addr, bool isWrite);
+    void logAddressData(Address addr);
+    void logAddressRead(Address addr);
+    void logAddressCoherence(Address addr);
 
     bool insert(Addr page_addr, bool hasWrite);
 
     bool checkPLB(Address addr, bool isWriteback);
 
+    void flush();
+
     bool checkTable(Address addr, bool isWriteback);
 
     // What 64-byte block is this address found in?
@@ -115,6 +120,13 @@
     Stats::Scalar m_table_read;
     Stats::Scalar m_table_write;
 
+    Stats::Scalar m_data_message;
+    Stats::Scalar m_read_message;
+    Stats::Scalar m_coherence_message;
+
+    Stats::Scalar m_plb_flushes;
+    Stats::Scalar m_plb_flushed_blocks;
+
 };
 
 #endif // __PERMISSION_TABLE_HH__
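PermissionTable::flush() above gives the TLB-shootdown path a single call that invalidates every cached permission, while the two new stats record how often that happens and how much PLB state it discards. The core of it as a self-contained sketch; field names follow PermissionTable.cc, and the Python containers stand in for the real bitmap and PLB structures:

    class PermissionTableSketch:
        def __init__(self, table_bits):
            self.permission_bitmap = [False] * table_bits  # backing table
            self.plb = {}                                  # tag -> permission bits
            self.num_plb_flushes = 0
            self.num_plb_flushed_blocks = 0

        def flush(self):
            # Mirrors PermissionTable::flush(): clear the table, drop the whole
            # PLB, and account for how much state was thrown away.
            self.num_plb_flushes += 1
            for i in range(len(self.permission_bitmap)):
                self.permission_bitmap[i] = False
            self.num_plb_flushed_blocks += len(self.plb)
            self.plb.clear()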