# HG changeset patch # User Lena Olson # Date 1449097727 21600 # Node ID 6fa05c2ac585027a388950a6285d0c395ca9ab76 # Parent 39259afef4564d564f4aefd2b22cff2e17110c55 imported patch fix-regress diff -r 39259afef456 -r 6fa05c2ac585 tests/regress.py --- a/tests/regress.py Fri Sep 19 00:22:52 2014 -0500 +++ b/tests/regress.py Wed Dec 02 17:08:47 2015 -0600 @@ -182,7 +182,7 @@ scons_opts += ' --ignore-style --no-lto EXTRAS=../gem5-gpu/src:../gpgpu-sim' for target in targets: - cmd = 'scons %s --default=../../gem5-gpu/build_opts/%s %s' % \ + cmd = 'python /usr/bin/scons %s --default=../../gem5-gpu/build_opts/%s %s' % \ (scons_opts, target[0], target[1]) print "Building/Running scons command: %s\n" % cmd if options.no_exec: # HG changeset patch # User Lena Olson # Date 1449097727 21600 # Node ID 2db8c41103a182f25d137cabed9e724b01be6878 # Parent 6fa05c2ac585027a388950a6285d0c395ca9ab76 imported patch fixes-for-gcc4.8 diff -r 6fa05c2ac585 -r 2db8c41103a1 src/gpu/shader_tlb.hh --- a/src/gpu/shader_tlb.hh Wed Dec 02 17:08:47 2015 -0600 +++ b/src/gpu/shader_tlb.hh Wed Dec 02 17:08:47 2015 -0600 @@ -54,6 +54,7 @@ public: virtual bool lookup(Addr vpn, Addr& ppn, bool set_mru=true) = 0; virtual void insert(Addr vpn, Addr ppn) = 0; + virtual ~BaseTLBMemory() {} }; class TLBMemory : public BaseTLBMemory { diff -r 6fa05c2ac585 -r 2db8c41103a1 src/mem/ruby/RubySlicc_GPUMappings.hh --- a/src/mem/ruby/RubySlicc_GPUMappings.hh Wed Dec 02 17:08:47 2015 -0600 +++ b/src/mem/ruby/RubySlicc_GPUMappings.hh Wed Dec 02 17:08:47 2015 -0600 @@ -40,7 +40,7 @@ inline MachineID getL2ID(Address addr, int num_l2, int select_bits, int select_start_bit) { - int num = 0; + unsigned int num = 0; if (select_bits) { if (num_l2 > pow(2, select_bits)) fatal("Number of GPU L2 select bits set incorrectly?"); # HG changeset patch # User Lena Olson # Date 1449097727 21600 # Node ID dbcd4579a673931bb7f07c9328c8d66fc37f179f # Parent 2db8c41103a182f25d137cabed9e724b01be6878 Updates to work with gem5 version 10451:3a87241adfb8 Updates head file placement and slightly changed TLB::finish params diff -r 2db8c41103a1 -r dbcd4579a673 configs/gpu_protocol/MESI_Two_Level_fusion.py --- a/configs/gpu_protocol/MESI_Two_Level_fusion.py Wed Dec 02 17:08:47 2015 -0600 +++ b/configs/gpu_protocol/MESI_Two_Level_fusion.py Wed Dec 02 17:08:47 2015 -0600 @@ -100,6 +100,8 @@ options.cpu_type == "detailed"), prefetcher = prefetcher, ruby_system = ruby_system, + #clk_domain=system.cpu[i].clk_domain, + transitions_per_cycle=options.ports, enable_prefetch = False) cpu_seq = RubySequencer(version = options.num_cpus + i, @@ -107,6 +109,7 @@ dcache = l1d_cache, access_phys_mem = True, max_outstanding_requests = options.gpu_l1_buf_depth, + #clk_domain=system.cpu[i].clk_domain, ruby_system = ruby_system, connect_to_io = False) @@ -120,6 +123,13 @@ cpu_sequencers.append(cpu_seq) topology.addController(l1_cntrl) + l1_cntrl.requestFromL1Cache = ruby_system.network.slave + l1_cntrl.responseFromL1Cache = ruby_system.network.slave + l1_cntrl.unblockFromL1Cache = ruby_system.network.slave + + l1_cntrl.requestToL1Cache = ruby_system.network.master + l1_cntrl.responseToL1Cache = ruby_system.network.master + cntrl_count += 1 ############################################################################ @@ -170,6 +180,13 @@ topology.addController(l1_cntrl) + l1_cntrl.requestFromL1Cache = ruby_system.network.slave + l1_cntrl.responseFromL1Cache = ruby_system.network.slave + l1_cntrl.unblockFromL1Cache = ruby_system.network.slave + + l1_cntrl.requestToL1Cache = 
ruby_system.network.master + l1_cntrl.responseToL1Cache = ruby_system.network.master + # Copy engine cache (make as small as possible, ideally 0) l1i_cache = L1Cache(size = "2kB", assoc = 2) @@ -205,4 +222,11 @@ cpu_sequencers.append(cpu_seq) topology.addController(l1_cntrl) + l1_cntrl.requestFromL1Cache = ruby_system.network.slave + l1_cntrl.responseFromL1Cache = ruby_system.network.slave + l1_cntrl.unblockFromL1Cache = ruby_system.network.slave + + l1_cntrl.requestToL1Cache = ruby_system.network.master + l1_cntrl.responseToL1Cache = ruby_system.network.master + return (cpu_sequencers, dir_cntrls, topology) diff -r 2db8c41103a1 -r dbcd4579a673 configs/gpu_protocol/MI_example_fusion.py --- a/configs/gpu_protocol/MI_example_fusion.py Wed Dec 02 17:08:47 2015 -0600 +++ b/configs/gpu_protocol/MI_example_fusion.py Wed Dec 02 17:08:47 2015 -0600 @@ -100,6 +100,12 @@ cpu_sequencers.append(cpu_seq) topology.addController(l1_cntrl) + # Connect the L1 controllers and the network + l1_cntrl.requestFromCache = ruby_system.network.slave + l1_cntrl.responseFromCache = ruby_system.network.slave + l1_cntrl.forwardToCache = ruby_system.network.master + l1_cntrl.responseToCache = ruby_system.network.master + ############################################################################ # Pagewalk cache # NOTE: We use a CPU L1 cache controller here. This is to facilatate MMU @@ -136,6 +142,12 @@ topology.addController(l1_cntrl) + # Connect the L1 controllers and the network + l1_cntrl.requestFromCache = ruby_system.network.slave + l1_cntrl.responseFromCache = ruby_system.network.slave + l1_cntrl.forwardToCache = ruby_system.network.master + l1_cntrl.responseToCache = ruby_system.network.master + #copy engine cache (make as small as possible, ideally 0) cache = Cache(size = "4kB", assoc = 2) @@ -164,4 +176,10 @@ cpu_sequencers.append(cpu_seq) topology.addController(l1_cntrl) + # Connect the L1 controllers and the network + l1_cntrl.requestFromCache = ruby_system.network.slave + l1_cntrl.responseFromCache = ruby_system.network.slave + l1_cntrl.forwardToCache = ruby_system.network.master + l1_cntrl.responseToCache = ruby_system.network.master + return cpu_sequencers, dir_cntrls, topology diff -r 2db8c41103a1 -r dbcd4579a673 configs/gpu_protocol/MOESI_hammer_fusion.py --- a/configs/gpu_protocol/MOESI_hammer_fusion.py Wed Dec 02 17:08:47 2015 -0600 +++ b/configs/gpu_protocol/MOESI_hammer_fusion.py Wed Dec 02 17:08:47 2015 -0600 @@ -94,6 +94,8 @@ options.allow_atomic_migration, send_evictions = ( options.cpu_type == "detailed"), + transitions_per_cycle = options.ports, + #clk_domain=system.cpu[i].clk_domain, ruby_system = ruby_system) cpu_seq = RubySequencer(version = options.num_cpus + i, @@ -101,10 +103,13 @@ dcache = l1d_cache, access_phys_mem = True, max_outstanding_requests = options.gpu_l1_buf_depth, + #clk_domain=system.cpu[i].clk_domain, ruby_system = ruby_system, connect_to_io = False) l1_cntrl.sequencer = cpu_seq + if options.recycle_latency: + l1_cntrl.recycle_latency = options.recycle_latency exec("ruby_system.l1_cntrl_sp%02d = l1_cntrl" % i) @@ -114,6 +119,16 @@ cpu_sequencers.append(cpu_seq) topology.addController(l1_cntrl) + # Connect the L1 controller and the network + # Connect the buffers from the controller to network + l1_cntrl.requestFromCache = ruby_system.network.slave + l1_cntrl.responseFromCache = ruby_system.network.slave + l1_cntrl.unblockFromCache = ruby_system.network.slave + + # Connect the buffers from the network to the controller + l1_cntrl.forwardToCache = 
ruby_system.network.master + l1_cntrl.responseToCache = ruby_system.network.master + cntrl_count += 1 ############################################################################ @@ -169,6 +184,16 @@ topology.addController(l1_cntrl) + # Connect the L1 controller and the network + # Connect the buffers from the controller to network + l1_cntrl.requestFromCache = ruby_system.network.slave + l1_cntrl.responseFromCache = ruby_system.network.slave + l1_cntrl.unblockFromCache = ruby_system.network.slave + + # Connect the buffers from the network to the controller + l1_cntrl.forwardToCache = ruby_system.network.master + l1_cntrl.responseToCache = ruby_system.network.master + # Copy engine cache (make as small as possible, ideally 0) l1i_cache = L1Cache(size = "2kB", assoc = 2) l1d_cache = L1Cache(size = "2kB", assoc = 2) @@ -204,4 +229,14 @@ cpu_sequencers.append(cpu_seq) topology.addController(l1_cntrl) + # Connect the L1 controller and the network + # Connect the buffers from the controller to network + l1_cntrl.requestFromCache = ruby_system.network.slave + l1_cntrl.responseFromCache = ruby_system.network.slave + l1_cntrl.unblockFromCache = ruby_system.network.slave + + # Connect the buffers from the network to the controller + l1_cntrl.forwardToCache = ruby_system.network.master + l1_cntrl.responseToCache = ruby_system.network.master + return (cpu_sequencers, dir_cntrl_nodes, topology) diff -r 2db8c41103a1 -r dbcd4579a673 configs/gpu_protocol/MOESI_hammer_split.py --- a/configs/gpu_protocol/MOESI_hammer_split.py Wed Dec 02 17:08:47 2015 -0600 +++ b/configs/gpu_protocol/MOESI_hammer_split.py Wed Dec 02 17:08:47 2015 -0600 @@ -102,4 +102,14 @@ cpu_sequencers.append(gpu_ce_seq) topology.addController(l1_cntrl) + # Connect the L1 controller and the network + # Connect the buffers from the controller to network + l1_cntrl.requestFromCache = ruby_system.network.slave + l1_cntrl.responseFromCache = ruby_system.network.slave + l1_cntrl.unblockFromCache = ruby_system.network.slave + + # Connect the buffers from the network to the controller + l1_cntrl.forwardToCache = ruby_system.network.master + l1_cntrl.responseToCache = ruby_system.network.master + return (cpu_sequencers, dir_cntrl_nodes, topology) diff -r 2db8c41103a1 -r dbcd4579a673 configs/gpu_protocol/VI_hammer.py --- a/configs/gpu_protocol/VI_hammer.py Wed Dec 02 17:08:47 2015 -0600 +++ b/configs/gpu_protocol/VI_hammer.py Wed Dec 02 17:08:47 2015 -0600 @@ -121,6 +121,16 @@ cpu_sequencers.append(cpu_seq) topology.add(l1_cntrl) + # Connect the L1 controller and the network + # Connect the buffers from the controller to network + l1_cntrl.requestFromCache = ruby_system.network.slave + l1_cntrl.responseFromCache = ruby_system.network.slave + l1_cntrl.unblockFromCache = ruby_system.network.slave + + # Connect the buffers from the network to the controller + l1_cntrl.forwardToCache = ruby_system.network.master + l1_cntrl.responseToCache = ruby_system.network.master + cpu_mem_range = AddrRange(options.total_mem_size) mem_module_size = cpu_mem_range.size() / options.num_dirs @@ -183,6 +193,16 @@ exec("ruby_system.dir_cntrl%d = dir_cntrl" % i) dir_cntrl_nodes.append(dir_cntrl) + # Connect the directory controller to the network + dir_cntrl.forwardFromDir = ruby_system.network.slave + dir_cntrl.responseFromDir = ruby_system.network.slave + dir_cntrl.dmaResponseFromDir = ruby_system.network.slave + + dir_cntrl.unblockToDir = ruby_system.network.master + dir_cntrl.responseToDir = ruby_system.network.master + dir_cntrl.requestToDir = 
ruby_system.network.master + dir_cntrl.dmaRequestToDir = ruby_system.network.master + dma_cntrl_nodes = [] for i, dma_port in enumerate(dma_ports): # @@ -202,4 +222,8 @@ if options.recycle_latency: dma_cntrl.recycle_latency = options.recycle_latency + # Connect the dma controller to the network + dma_cntrl.responseFromDir = ruby_system.network.master + dma_cntrl.requestToDir = ruby_system.network.slave + return (cpu_sequencers, dir_cntrl_nodes, dma_cntrl_nodes, topology) diff -r 2db8c41103a1 -r dbcd4579a673 configs/gpu_protocol/VI_hammer_fusion.py --- a/configs/gpu_protocol/VI_hammer_fusion.py Wed Dec 02 17:08:47 2015 -0600 +++ b/configs/gpu_protocol/VI_hammer_fusion.py Wed Dec 02 17:08:47 2015 -0600 @@ -120,6 +120,11 @@ all_sequencers.append(gpu_seq) gpu_cluster.add(l1_cntrl) + # Connect the controller to the network + l1_cntrl.requestFromL1Cache = ruby_system.network.slave + l1_cntrl.atomicRequestFromL1Cache = ruby_system.network.slave + l1_cntrl.responseToL1Cache = ruby_system.network.master + l2_index_start = block_size_bits + l2_bits # Use L2 cache and interconnect latencies to calculate protocol latencies # NOTE! These latencies are in Ruby (cache) cycles, not SM cycles @@ -155,6 +160,17 @@ gpu_cluster.add(l2_cluster) l2_clusters.append(l2_cluster) + # Connect the controller to the network + l2_cntrl.responseToL1Cache = ruby_system.network.slave + l2_cntrl.requestFromCache = ruby_system.network.slave + l2_cntrl.responseFromCache = ruby_system.network.slave + l2_cntrl.unblockFromCache = ruby_system.network.slave + + l2_cntrl.requestFromL1Cache = ruby_system.network.master + l2_cntrl.atomicRequestFromL1Cache = ruby_system.network.master + l2_cntrl.forwardToCache = ruby_system.network.master + l2_cntrl.responseToCache = ruby_system.network.master + ############################################################################ # Pagewalk cache # NOTE: We use a CPU L1 cache controller here. This is to facilatate MMU @@ -210,6 +226,16 @@ gpu_cluster.add(l1_cntrl) + # Connect the L1 controller and the network + # Connect the buffers from the controller to network + l1_cntrl.requestFromCache = ruby_system.network.slave + l1_cntrl.responseFromCache = ruby_system.network.slave + l1_cntrl.unblockFromCache = ruby_system.network.slave + + # Connect the buffers from the network to the controller + l1_cntrl.forwardToCache = ruby_system.network.master + l1_cntrl.responseToCache = ruby_system.network.master + # # Create controller for the copy engine to connect to in GPU cluster @@ -235,6 +261,9 @@ all_sequencers.append(gpu_ce_seq) + gpu_ce_cntrl.responseFromDir = ruby_system.network.master + gpu_ce_cntrl.reqToDirectory = ruby_system.network.slave + complete_cluster = Cluster(intBW = 32, extBW = 32) complete_cluster.add(gpu_ce_cntrl) complete_cluster.add(cpu_cluster) diff -r 2db8c41103a1 -r dbcd4579a673 configs/gpu_protocol/VI_hammer_split.py --- a/configs/gpu_protocol/VI_hammer_split.py Wed Dec 02 17:08:47 2015 -0600 +++ b/configs/gpu_protocol/VI_hammer_split.py Wed Dec 02 17:08:47 2015 -0600 @@ -151,6 +151,11 @@ all_sequencers.append(gpu_seq) gpu_cluster.add(l1_cntrl) + # Connect the controller to the network + l1_cntrl.requestFromL1Cache = ruby_system.network.master + l1_cntrl.atomicRequestFromL1Cache = ruby_system.network.master + l1_cntrl.responseToL1Cache = ruby_system.network.slave + l2_index_start = block_size_bits + l2_bits # Use L2 cache and interconnect latencies to calculate protocol latencies # NOTE! 
These latencies are in Ruby (cache) cycles, not SM cycles @@ -186,6 +191,16 @@ gpu_cluster.add(l2_cluster) l2_clusters.append(l2_cluster) + # Connect the controller to the network + l2_cntrl.responseToL1Cache = ruby_system.network.master + l2_cntrl.requestFromCache = ruby_system.network.master + l2_cntrl.responseFromCache = ruby_system.network.master + l2_cntrl.unblockFromCache = ruby_system.network.master + l2_cntrl.requestFromL1Cache = ruby_system.network.slave + l2_cntrl.atomicRequestFromL1Cache = ruby_system.network.slave + l2_cntrl.forwardToCache = ruby_system.network.slave + l2_cntrl.responseToCache = ruby_system.network.slave + gpu_phys_mem_size = system.gpu_physmem.range.size() if options.num_dev_dirs > 0: @@ -254,6 +269,16 @@ exec("ruby_system.dev_dir_cntrl%d = dev_dir_cntrl" % i) dir_cntrls.append(dev_dir_cntrl) + + # Connect the directory controller to the network + dir_cntrl.forwardFromDir = ruby_system.network.slave + dir_cntrl.responseFromDir = ruby_system.network.slave + dir_cntrl.dmaResponseFromDir = ruby_system.network.slave + + dir_cntrl.unblockToDir = ruby_system.network.master + dir_cntrl.responseToDir = ruby_system.network.master + dir_cntrl.requestToDir = ruby_system.network.master + dir_cntrl.dmaRequestToDir = ruby_system.network.master else: # Since there are no device directories, use CPU directories # Fix up the memory sizes of the CPU directories @@ -288,6 +313,9 @@ all_sequencers.append(cpu_ce_seq) all_sequencers.append(gpu_ce_seq) + gpu_ce_cntrl.responseFromDir = ruby_system.network.slave + gpu_ce_cntrl.reqToDirectory = ruby_system.network.master + complete_cluster = Cluster(intBW = 32, extBW = 32) complete_cluster.add(cpu_ce_cntrl) complete_cluster.add(gpu_ce_cntrl) diff -r 2db8c41103a1 -r dbcd4579a673 src/gpu/shader_mmu.hh --- a/src/gpu/shader_mmu.hh Wed Dec 02 17:08:47 2015 -0600 +++ b/src/gpu/shader_mmu.hh Wed Dec 02 17:08:47 2015 -0600 @@ -99,7 +99,7 @@ BaseTLB::Mode _mode, ThreadContext *_tc, bool prefetch=false); void markDelayed() { wrappedTranslation->markDelayed(); } - void finish(Fault fault, RequestPtr _req, ThreadContext *_tc, + void finish(const Fault &fault, RequestPtr _req, ThreadContext *_tc, BaseTLB::Mode _mode) { assert(_mode == mode); diff -r 2db8c41103a1 -r dbcd4579a673 src/mem/protocol/VI-ce.sm --- a/src/mem/protocol/VI-ce.sm Wed Dec 02 17:08:47 2015 -0600 +++ b/src/mem/protocol/VI-ce.sm Wed Dec 02 17:08:47 2015 -0600 @@ -1,12 +1,14 @@ machine(GPUCopyDMA, "VI Copy Engine Controller") -: Sequencer * sequencer, - Cycles request_latency = 6 +: Sequencer * sequencer; + Cycles request_latency := 6; + + MessageBuffer * responseFromDir, network="From", virtual_network="1", + ordered="true", vnet_type="response"; + MessageBuffer * reqToDirectory, network="To", virtual_network="0", + ordered="true", vnet_type="request"; + { - - MessageBuffer responseFromDir, network="From", virtual_network="1", ordered="true", vnet_type="response"; - MessageBuffer reqToDirectory, network="To", virtual_network="0", ordered="true", vnet_type="request"; - state_declaration(State, desc="CE states", default="GPUCopyDMA_State_READY") { READY, AccessPermission:Invalid, desc="Ready to accept a new request"; BUSY_RD, AccessPermission:Busy, desc="Busy: currently processing a request"; diff -r 2db8c41103a1 -r dbcd4579a673 src/mem/protocol/VI_hammer-CPUCache.sm --- a/src/mem/protocol/VI_hammer-CPUCache.sm Wed Dec 02 17:08:47 2015 -0600 +++ b/src/mem/protocol/VI_hammer-CPUCache.sm Wed Dec 02 17:08:47 2015 -0600 @@ -34,25 +34,29 @@ */ machine({L1Cache, L2Cache}, "AMD Hammer-like 
protocol") -: Sequencer * sequencer, - CacheMemory * L1Icache, - CacheMemory * L1Dcache, - CacheMemory * L2cache, - Cycles cache_response_latency = 10, - Cycles issue_latency = 1, - Cycles l2_cache_hit_latency = 15, - bool no_mig_atomic = true, - bool send_evictions -{ +: Sequencer * sequencer; + CacheMemory * L1Icache; + CacheMemory * L1Dcache; + CacheMemory * L2cache; + Cycles cache_response_latency := 10; + Cycles issue_latency := 1; + Cycles l2_cache_hit_latency := 15; + bool no_mig_atomic := "True"; + bool send_evictions; // NETWORK BUFFERS - MessageBuffer requestFromCache, network="To", virtual_network="2", ordered="false", vnet_type="request"; - MessageBuffer responseFromCache, network="To", virtual_network="4", ordered="false", vnet_type="response"; - MessageBuffer unblockFromCache, network="To", virtual_network="5", ordered="false", vnet_type="unblock"; + MessageBuffer * requestFromCache, network="To", virtual_network="2", + ordered="false", vnet_type="request"; + MessageBuffer * responseFromCache, network="To", virtual_network="4", + ordered="false", vnet_type="response"; + MessageBuffer * unblockFromCache, network="To", virtual_network="5", + ordered="false", vnet_type="unblock"; - MessageBuffer forwardToCache, network="From", virtual_network="3", ordered="false", vnet_type="forward"; - MessageBuffer responseToCache, network="From", virtual_network="4", ordered="false", vnet_type="response"; - + MessageBuffer * forwardToCache, network="From", virtual_network="3", + ordered="false", vnet_type="forward"; + MessageBuffer * responseToCache, network="From", virtual_network="4", + ordered="false", vnet_type="response"; +{ // STATES state_declaration(State, desc="Cache states", default="L1Cache_State_I") { diff -r 2db8c41103a1 -r dbcd4579a673 src/mem/protocol/VI_hammer-GPUL1cache.sm --- a/src/mem/protocol/VI_hammer-GPUL1cache.sm Wed Dec 02 17:08:47 2015 -0600 +++ b/src/mem/protocol/VI_hammer-GPUL1cache.sm Wed Dec 02 17:08:47 2015 -0600 @@ -1,18 +1,22 @@ machine(GPUL1Cache, "VI GPU L1 Cache") -: Sequencer * sequencer, - CacheMemory * cache, - int l2_select_num_bits, - int num_l2, - Cycles issue_latency = 416, -{ +: Sequencer * sequencer; + CacheMemory * cache; + int l2_select_num_bits; + int num_l2; + Cycles issue_latency := 416; + // NETWORK BUFFERS - MessageBuffer requestFromL1Cache, network="To", virtual_network="7", ordered="true", vnet_type="request"; - MessageBuffer atomicRequestFromL1Cache, network="To", virtual_network="8", ordered="true", vnet_type="request"; + MessageBuffer * requestFromL1Cache, network="To", virtual_network="7", + ordered="true", vnet_type="request"; + MessageBuffer * atomicRequestFromL1Cache, network="To", virtual_network="8", + ordered="true", vnet_type="request"; - MessageBuffer responseToL1Cache, network="From", virtual_network="6", ordered="true", vnet_type="response"; + MessageBuffer * responseToL1Cache, network="From", virtual_network="6", + ordered="true", vnet_type="response"; +{ // STATES state_declaration(State, desc="Cache states") { I, AccessPermission:Invalid, desc="Not Present/Invalid"; diff -r 2db8c41103a1 -r dbcd4579a673 src/mem/protocol/VI_hammer-GPUL2cache.sm --- a/src/mem/protocol/VI_hammer-GPUL2cache.sm Wed Dec 02 17:08:47 2015 -0600 +++ b/src/mem/protocol/VI_hammer-GPUL2cache.sm Wed Dec 02 17:08:47 2015 -0600 @@ -1,28 +1,36 @@ machine(GPUL2Cache, "Simple write back L2 cache") - : CacheMemory * L2cache, - Cycles l2_request_latency = 260, - Cycles l2_response_latency = 2, - Cycles cache_response_latency = 260, -{ + : CacheMemory * L2cache; 
+ Cycles l2_request_latency := 260; + Cycles l2_response_latency := 2; + Cycles cache_response_latency := 260; //Note: we might have a problem if two Get atomics arrive from different L1's at the same time // NETWORK BUFFERS // Buffers to and from L1 caches - MessageBuffer requestFromL1Cache, network="From", virtual_network="7", ordered="true", vnet_type="request"; - MessageBuffer responseToL1Cache, network="To", virtual_network="6", ordered="true", vnet_type="response"; - MessageBuffer atomicRequestFromL1Cache, network="From", virtual_network="8", ordered="true", vnet_type="request"; + MessageBuffer * requestFromL1Cache, network="From", virtual_network="7", + ordered="true", vnet_type="request"; + MessageBuffer * responseToL1Cache, network="To", virtual_network="6", + ordered="true", vnet_type="response"; + MessageBuffer * atomicRequestFromL1Cache, network="From", virtual_network="8", + ordered="true", vnet_type="request"; // Buffers to / from the dir and other caches - MessageBuffer requestFromCache, network="To", virtual_network="2", ordered="false", vnet_type="request"; - MessageBuffer responseFromCache, network="To", virtual_network="4", ordered="false", vnet_type="response"; - MessageBuffer unblockFromCache, network="To", virtual_network="5", ordered="false", vnet_type="unblock"; + MessageBuffer * requestFromCache, network="To", virtual_network="2", + ordered="false", vnet_type="request"; + MessageBuffer * responseFromCache, network="To", virtual_network="4", + ordered="false", vnet_type="response"; + MessageBuffer * unblockFromCache, network="To", virtual_network="5", + ordered="false", vnet_type="unblock"; - MessageBuffer forwardToCache, network="From", virtual_network="3", ordered="false", vnet_type="forward"; - MessageBuffer responseToCache, network="From", virtual_network="4", ordered="false", vnet_type="response"; + MessageBuffer * forwardToCache, network="From", virtual_network="3", + ordered="false", vnet_type="forward"; + MessageBuffer * responseToCache, network="From", virtual_network="4", + ordered="false", vnet_type="response"; +{ // STATES state_declaration(State, desc="Cache states") { I, AccessPermission:Invalid, desc="Idle"; diff -r 2db8c41103a1 -r dbcd4579a673 src/mem/protocol/VI_hammer-dir.sm --- a/src/mem/protocol/VI_hammer-dir.sm Wed Dec 02 17:08:47 2015 -0600 +++ b/src/mem/protocol/VI_hammer-dir.sm Wed Dec 02 17:08:47 2015 -0600 @@ -34,28 +34,35 @@ */ machine(Directory, "AMD Hammer-like protocol") -: DirectoryMemory * directory, - CacheMemory * probeFilter, - MemoryControl * memBuffer, - Cycles memory_controller_latency = 12, - bool probe_filter_enabled = false, - bool full_bit_dir_enabled = false -{ +: DirectoryMemory * directory; + CacheMemory * probeFilter; + MemoryControl * memBuffer; + Cycles memory_controller_latency := 12; + bool probe_filter_enabled := "False"; + bool full_bit_dir_enabled := "False"; - MessageBuffer forwardFromDir, network="To", virtual_network="3", ordered="false", vnet_type="forward"; - MessageBuffer responseFromDir, network="To", virtual_network="4", ordered="false", vnet_type="response"; + MessageBuffer * forwardFromDir, network="To", virtual_network="3", + ordered="false", vnet_type="forward"; + MessageBuffer * responseFromDir, network="To", virtual_network="4", + ordered="false", vnet_type="response"; // // For a finite buffered network, note that the DMA response network only // works at this relatively lower numbered (lower priority) virtual network // because the trigger queue decouples cache responses from DMA responses. 
// - MessageBuffer dmaResponseFromDir, network="To", virtual_network="1", ordered="true", vnet_type="response"; + MessageBuffer * dmaResponseFromDir, network="To", virtual_network="1", + ordered="true", vnet_type="response"; - MessageBuffer unblockToDir, network="From", virtual_network="5", ordered="false", vnet_type="unblock"; - MessageBuffer responseToDir, network="From", virtual_network="4", ordered="false", vnet_type="response"; - MessageBuffer requestToDir, network="From", virtual_network="2", ordered="false", vnet_type="request", recycle_latency="1"; - MessageBuffer dmaRequestToDir, network="From", virtual_network="0", ordered="true", vnet_type="request"; + MessageBuffer * unblockToDir, network="From", virtual_network="5", + ordered="false", vnet_type="unblock"; + MessageBuffer * responseToDir, network="From", virtual_network="4", + ordered="false", vnet_type="response"; + MessageBuffer * requestToDir, network="From", virtual_network="2", + ordered="false", vnet_type="request", recycle_latency="1"; + MessageBuffer * dmaRequestToDir, network="From", virtual_network="0", + ordered="true", vnet_type="request"; +{ // STATES state_declaration(State, desc="Directory states", default="Directory_State_E") { // Base states diff -r 2db8c41103a1 -r dbcd4579a673 src/mem/protocol/VI_hammer-dma.sm --- a/src/mem/protocol/VI_hammer-dma.sm Wed Dec 02 17:08:47 2015 -0600 +++ b/src/mem/protocol/VI_hammer-dma.sm Wed Dec 02 17:08:47 2015 -0600 @@ -28,13 +28,15 @@ machine(DMA, "DMA Controller") -: DMASequencer * dma_sequencer, - Cycles request_latency = 6 +: DMASequencer * dma_sequencer; + Cycles request_latency := 6; + + MessageBuffer * responseFromDir, network="From", virtual_network="1", + ordered="true", vnet_type="response", no_vector="true"; + MessageBuffer * reqToDirectory, network="To", virtual_network="0", + ordered="false", vnet_type="request", no_vector="true"; + { - - MessageBuffer responseFromDir, network="From", virtual_network="1", ordered="true", vnet_type="response", no_vector="true"; - MessageBuffer reqToDirectory, network="To", virtual_network="0", ordered="false", vnet_type="request", no_vector="true"; - state_declaration(State, desc="DMA states", default="DMA_State_READY") { diff -r 2db8c41103a1 -r dbcd4579a673 src/mem/ruby/RubySlicc_GPUMappings.hh --- a/src/mem/ruby/RubySlicc_GPUMappings.hh Wed Dec 02 17:08:47 2015 -0600 +++ b/src/mem/ruby/RubySlicc_GPUMappings.hh Wed Dec 02 17:08:47 2015 -0600 @@ -29,13 +29,14 @@ #ifndef __MEM_RUBY_SLICC_GPUMAPPINGS_HH__ #define __MEM_RUBY_SLICC_GPUMAPPINGS_HH__ -#include +#include + #include "mem/protocol/MachineType.hh" #include "mem/ruby/common/Address.hh" #include "mem/ruby/common/Global.hh" +#include "mem/ruby/common/MachineID.hh" #include "mem/ruby/common/NetDest.hh" -#include "mem/ruby/system/DirectoryMemory.hh" -#include "mem/ruby/system/MachineID.hh" +#include "mem/ruby/structures/DirectoryMemory.hh" inline MachineID getL2ID(Address addr, int num_l2, int select_bits, int select_start_bit) # HG changeset patch # User Lena Olson # Date 1449097728 21600 # Node ID 3ee9d80f490fad33e0fa9c18fdba958942d3c63b # Parent dbcd4579a673931bb7f07c9328c8d66fc37f179f Add the ability to do TLB shootdown. 
This also adds a hacky way to test TLB shootdown diff -r dbcd4579a673 -r 3ee9d80f490f configs/fs_fusion.py --- a/configs/fs_fusion.py Wed Dec 02 17:08:47 2015 -0600 +++ b/configs/fs_fusion.py Wed Dec 02 17:08:48 2015 -0600 @@ -141,6 +141,9 @@ system.gpu_physmem = SimpleMemory(range = gpu_mem_range) system.gpu_physmem.port = system.iobus.master +system.gpu.test_tlb_shootdown = True +system.gpu.tlb_shootdown_tick = 5461844154685 + (29247313 / 4) + # # Setup Ruby # diff -r dbcd4579a673 -r 3ee9d80f490f src/gpu/gpgpu-sim/CudaGPU.py --- a/src/gpu/gpgpu-sim/CudaGPU.py Wed Dec 02 17:08:47 2015 -0600 +++ b/src/gpu/gpgpu-sim/CudaGPU.py Wed Dec 02 17:08:48 2015 -0600 @@ -55,3 +55,7 @@ gpu_memory_range = Param.AddrRange(AddrRange('1kB'), "The address range for the GPU memory space") shader_mmu = Param.ShaderMMU(ShaderMMU(), "Memory managment unit for this GPU") + + test_tlb_shootdown = Param.Bool(False, "If true, insert a shootdown event") + tlb_shootdown_tick = Param.Tick(0, "Relative tick after restore to issue the shootdown") + diff -r dbcd4579a673 -r 3ee9d80f490f src/gpu/gpgpu-sim/cuda_core.cc --- a/src/gpu/gpgpu-sim/cuda_core.cc Wed Dec 02 17:08:47 2015 -0600 +++ b/src/gpu/gpgpu-sim/cuda_core.cc Wed Dec 02 17:08:48 2015 -0600 @@ -60,6 +60,10 @@ warpSize = cudaGPU->getWarpSize(); signalKernelFinish = false; + signalFlushFinish = false; + flushFinished = false; + + memoryPaused = false; if (p->port_lsq_port_connection_count != warpSize) { panic("Shader core lsq_port size != to warp size\n"); @@ -247,6 +251,11 @@ bool CudaCore::executeMemOp(const warp_inst_t &inst) { + if (memoryPaused) { + // return true: there should be a pipeline stall + return true; + } + assert(inst.space.get_type() == global_space || inst.space.get_type() == const_space || inst.op == BARRIER_OP || @@ -397,6 +406,12 @@ shaderImpl->finish_kernel(); signalKernelFinish = false; } + if (signalFlushFinish) { + flushFinished = true; + cudaGPU->cudaCoreFlushFinish(); + signalFlushFinish = false; + // NOTE: this signal flag will be reset by the cudaGPU. + } } else { panic("Received unhandled packet type in control port"); } @@ -427,6 +442,13 @@ } void +CudaCore::beginCoreFlush() +{ + signalFlushFinish = true; + flush(); +} + +void CudaCore::finishKernel() { numKernelsCompleted++; diff -r dbcd4579a673 -r 3ee9d80f490f src/gpu/gpgpu-sim/cuda_core.hh --- a/src/gpu/gpgpu-sim/cuda_core.hh Wed Dec 02 17:08:47 2015 -0600 +++ b/src/gpu/gpgpu-sim/cuda_core.hh Wed Dec 02 17:08:48 2015 -0600 @@ -180,6 +180,15 @@ // if true then need to signal GPGPU-Sim once cleanup is done bool signalKernelFinish; + // if true then need to signal the cudaGPU once the flush is finished + bool signalFlushFinish; + + // The flush has finished on the core, but not others. 
Cleared by cudaGPU + bool flushFinished; + + // if true, do not accept any new memory requests from the shader cores + bool memoryPaused; + // Returns the line of the address, a Addr addrToLine(Addr a); @@ -230,6 +239,12 @@ // Handle an instruction port retry request void handleRetry(); + /** + * Flush the core of all pending instructions, + * This is currently used to force the LSQ to flush on kernel end + */ + void flush(); + public: // Receive and complete an instruction fetch void recvInstResp(PacketPtr pkt); @@ -263,10 +278,25 @@ void writebackClear(); /** - * Flush the core of all pending instructions, - * This is currently used to force the LSQ to flush on kernel end + * Called from the cudaGPU when flushing all of the GPU state */ - void flush(); + void beginCoreFlush(); + + /** + * Return whether or not the flush is finished + */ + bool checkFlushFinish() { return flushFinished; } + + /** + * Clear the flushFinished flag. Called from cudaGPU. + */ + void clearFlushFinish() { flushFinished = false; } + + /** + * (Un)Pause all accesses to memory. Currently used during TLB shootdown + */ + void pauseMemory() { memoryPaused = true; } + void unpauseMemory() { memoryPaused = false; } /** * Called from GPGPU-Sim when a kernel completes on this shader diff -r dbcd4579a673 -r 3ee9d80f490f src/gpu/gpgpu-sim/cuda_gpu.cc --- a/src/gpu/gpgpu-sim/cuda_gpu.cc Wed Dec 02 17:08:47 2015 -0600 +++ b/src/gpu/gpgpu-sim/cuda_gpu.cc Wed Dec 02 17:08:48 2015 -0600 @@ -62,13 +62,15 @@ CudaGPU::CudaGPU(const Params *p) : ClockedObject(p), _params(p), gpuTickEvent(this, false), streamTickEvent(this, true), + tlbShootdownEvent(this), system(p->sys), warpSize(p->warp_size), sharedMemDelay(p->shared_mem_delay), gpgpusimConfigPath(p->config_path), launchDelay(p->kernel_launch_delay), returnDelay(p->kernel_return_delay), unblockNeeded(false), ruby(p->ruby), runningTC(NULL), runningStream(NULL), runningTID(-1), clearTick(0), dumpKernelStats(p->dump_kernel_stats), pageTable(), manageGPUMemory(p->manage_gpu_memory), - gpuMemoryRange(p->gpu_memory_range), shaderMMU(p->shader_mmu) + gpuMemoryRange(p->gpu_memory_range), shaderMMU(p->shader_mmu), + testShootdown(p->test_tlb_shootdown), shootdownTick(p->tlb_shootdown_tick) { // Register this device as a CUDA-enabled GPU cudaDeviceID = registerCudaDevice(this); @@ -239,6 +241,10 @@ (*iter)->initialize(); } + if (testShootdown) { + schedule(tlbShootdownEvent, shootdownTick); + } + if (!restoring) { return; } @@ -278,6 +284,11 @@ clearTick = curTick(); } +void CudaGPU::registerTLB(ShaderTLB *tlb) +{ + shaderTLBs.push_back(tlb); +} + void CudaGPU::registerCudaCore(CudaCore *sc) { cudaCores.push_back(sc); @@ -488,6 +499,26 @@ endStreamOperation(); } +void CudaGPU::cudaCoreFlushFinish() +{ + DPRINTF(CudaGPU, "Shootdown: Got a finish flush.\n"); + bool allDone = true; + for (auto it: cudaCores) { + if (!it->checkFlushFinish()) { + DPRINTF(CudaGPU, "Shootdown: Not done yet...\n"); + allDone = false; + break; + } + } + if (allDone) { + DPRINTF(CudaGPU, "Finally done!\n"); + for (auto it: cudaCores) { + it->clearFlushFinish(); + } + schedule(tlbShootdownEvent, nextCycle()); + } +} + // TODO: When we move the stream manager into libcuda, this will need to be // eliminated, and libcuda will have to decide when to block the calling thread bool CudaGPU::needsToBlock() @@ -722,6 +753,56 @@ ; } +void CudaGPU::TLBShootdownEvent::process() +{ + DPRINTF(CudaGPU, "Processing shootdown!\n"); + + switch(stage) { + case Stage::Idle: + DPRINTF(CudaGPU, "Shootdown: Pausing memory\n"); + for 
(auto it: gpu->cudaCores) { + it->pauseMemory(); + } + stage = Stage::Pausing; + gpu->schedule(this, gpu->nextCycle()); + break; + case Stage::Pausing: + DPRINTF(CudaGPU, "Shootdown: Flushing cores\n"); + for (auto it: gpu->cudaCores) { + it->beginCoreFlush(); + } + stage = Stage::FlushingL1s; + break; + case Stage::FlushingL1s: + DPRINTF(CudaGPU, "Shootdown: DONE flushing cores\n"); + gpu->schedule(this, gpu->nextCycle()); + stage = Stage::FlushingOthers; + break; + case Stage::FlushingOthers: + DPRINTF(CudaGPU, "Shootdown: Flushing others\n"); + DPRINTF(CudaGPU, "Shootdown: Flushing TLBs\n"); + for (auto it: gpu->shaderTLBs) { + it->flushAll(); + } + + DPRINTF(CudaGPU, "Shootdown: Flushing MMU\n"); + gpu->shaderMMU->flushAll(); + gpu->schedule(this, gpu->clockEdge(Cycles(5))); + stage = Stage::Unpausing; + break; + case Stage::Unpausing: + DPRINTF(CudaGPU, "Shootdown: Unpausing\n"); + for (auto it: gpu->cudaCores) { + it->unpauseMemory(); + } + stage = Stage::Idle; + // NO need to schedule anything + break; + default: + panic("Unexpected current shootdown stage"); + } +} + /** * virtual process function that is invoked when the callback * queue is executed. diff -r dbcd4579a673 -r 3ee9d80f490f src/gpu/gpgpu-sim/cuda_gpu.hh --- a/src/gpu/gpgpu-sim/cuda_gpu.hh Wed Dec 02 17:08:47 2015 -0600 +++ b/src/gpu/gpgpu-sim/cuda_gpu.hh Wed Dec 02 17:08:48 2015 -0600 @@ -38,6 +38,7 @@ #include "base/callback.hh" #include "debug/CudaGPUPageTable.hh" #include "gpgpu-sim/gpu-sim.h" +#include "gpu/shader_tlb.hh" #include "params/CudaGPU.hh" #include "sim/process.hh" #include "sim/system.hh" @@ -162,6 +163,21 @@ } }; + class TLBShootdownEvent : public Event + { + + private: + CudaGPU *gpu; + enum class Stage {Idle, Pausing, FlushingL1s, FlushingOthers, + Unpausing}; + Stage stage; + + public: + TLBShootdownEvent(CudaGPU *_gpu) : gpu(_gpu), stage(Stage::Idle) {} + void process(); + }; + friend class TLBShootdownEvent; + const CudaGPUParams *_params; const Params * params() const { return dynamic_cast<const Params *>(_params); } @@ -171,6 +187,8 @@ /// Tick for when the stream manager needs execute TickEvent streamTickEvent; + TLBShootdownEvent tlbShootdownEvent; + private: // The CUDA device ID for this GPU unsigned cudaDeviceID; @@ -212,6 +230,9 @@ /// Holds all of the CUDA cores in this GPU std::vector<CudaCore*> cudaCores; + /// Holds all of the GPU shader private TLBs + std::vector<ShaderTLB*> shaderTLBs; + /// The thread context, stream and thread ID currently running on the SPA ThreadContext *runningTC; struct CUstream_st *runningStream; @@ -324,6 +345,9 @@ CudaDeviceProperties deviceProperties; + bool testShootdown; + Tick shootdownTick; + public: /// Constructor CudaGPU(const Params *p); @@ -338,6 +362,7 @@ /// Register devices callbacks void registerCudaCore(CudaCore *sc); void registerCopyEngine(GPUCopyEngine *ce); + void registerTLB(ShaderTLB *tlb); /// Getter for whether we are using Ruby or GPGPU-Sim memory modeling CudaDeviceProperties *getDeviceProperties() { return &deviceProperties; } @@ -400,6 +425,9 @@ /// Called by the copy engine when a memcpy or memset is complete void finishCopyOperation(); + /// Called by the cuda cores when they finish flushing if signalFlush is set + void cudaCoreFlushFinish(); + /// Called from shader TLB to be used for TLB lookups /// TODO: Move the thread context handling to GPU context when we get there ThreadContext *getThreadContext() { return runningTC; } diff -r dbcd4579a673 -r 3ee9d80f490f src/gpu/shader_mmu.cc --- a/src/gpu/shader_mmu.cc Wed Dec 02 17:08:47 2015 -0600 +++ 
b/src/gpu/shader_mmu.cc Wed Dec 02 17:08:48 2015 -0600 @@ -484,6 +484,18 @@ } void +ShaderMMU::flushAll() +{ + assert(pendingWalks.empty()); + assert(outstandingWalks.empty()); + assert(pendingFaults.empty()); + if (tlb != nullptr) { + tlb->flushAll(); + } + prefetchBuffer.clear(); +} + +void ShaderMMU::regStats() { numPagefaults diff -r dbcd4579a673 -r 3ee9d80f490f src/gpu/shader_mmu.hh --- a/src/gpu/shader_mmu.hh Wed Dec 02 17:08:47 2015 -0600 +++ b/src/gpu/shader_mmu.hh Wed Dec 02 17:08:48 2015 -0600 @@ -172,6 +172,9 @@ /// Handle a page fault once it's done (called from CUDA API via CudaGPU) void handleFinishPageFault(ThreadContext *tc); + /// Flush any TLBs and others that needs to be flushed for TLB shootdown + void flushAll(); + void regStats(); Stats::Scalar numPagefaults; diff -r dbcd4579a673 -r 3ee9d80f490f src/gpu/shader_tlb.cc --- a/src/gpu/shader_tlb.cc Wed Dec 02 17:08:47 2015 -0600 +++ b/src/gpu/shader_tlb.cc Wed Dec 02 17:08:48 2015 -0600 @@ -53,6 +53,8 @@ tlbMemory = new InfiniteTLBMemory(); } mmu = cudaGPU->getMMU(); + + cudaGPU->registerTLB(this); } void @@ -165,7 +167,7 @@ void ShaderTLB::flushAll() { - panic("Flush all unimplemented"); + tlbMemory->flushAll(); } bool @@ -218,6 +220,18 @@ } void +TLBMemory::flushAll() +{ + for (int i=0; i < ways; i++) { + for (int j=0; j < sets; j++) { # HG changeset patch # User Lena Olson # Date 1449097728 21600 # Node ID 7b001aa001f007c6af4ab6ddfcf2f3b491b108d9 # Parent 3ee9d80f490fad33e0fa9c18fdba958942d3c63b This is adding slicc files to implement the BCU, but it seems like the wrong way to go diff -r 3ee9d80f490f -r 7b001aa001f0 configs/gpu_protocol/MOESI_hammer_bcu.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/configs/gpu_protocol/MOESI_hammer_bcu.py Wed Dec 02 17:08:48 2015 -0600 @@ -0,0 +1,2 @@ +# Almost empty file to trick ruby into working +from MOESI_hammer import * \ No newline at end of file diff -r 3ee9d80f490f -r 7b001aa001f0 configs/gpu_protocol/MOESI_hammer_bcu_fusion.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/configs/gpu_protocol/MOESI_hammer_bcu_fusion.py Wed Dec 02 17:08:48 2015 -0600 @@ -0,0 +1,258 @@ +# Copyright (c) 2006-2007 The Regents of The University of Michigan +# Copyright (c) 2009 Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# Authors: Brad Beckmann + +import math +import m5 +from m5.objects import * +from m5.defines import buildEnv +from Ruby import create_topology + +# +# Note: the L1 Cache latency is only used by the sequencer on fast path hits +# +class L1Cache(RubyCache): + latency = 1 + +# +# Note: the L2 Cache latency is not currently used +# +class L2Cache(RubyCache): + latency = 10 + +def create_system(options, system, dma_ports, ruby_system): + + if not buildEnv['GPGPU_SIM']: + m5.util.panic("This script requires GPGPU-Sim integration to be built.") + + print "Creating system for GPU" + + # Run the original protocol script + buildEnv['PROTOCOL'] = buildEnv['PROTOCOL'][:-11] + protocol = buildEnv['PROTOCOL'] + exec "import %s" % protocol + try: + (cpu_sequencers, dir_cntrl_nodes, topology) = \ + eval("%s.create_system(options, system, dma_ports, ruby_system)" % protocol) + except: + print "Error: could not create system for ruby protocol inside fusion system %s" % protocol + raise + + # + # Must create the individual controllers before the network to ensure the + # controller constructors are called before the network constructor + # + block_size_bits = int(math.log(options.cacheline_size, 2)) + + cntrl_count = 0 + + for i in xrange(options.num_sc): + # + # First create the Ruby objects associated with this cpu + # + l1i_cache = L1Cache(size = options.l1i_size, + assoc = options.l1i_assoc, + start_index_bit = block_size_bits, + is_icache = True) + l1d_cache = L1Cache(size = options.l1d_size, + assoc = options.l1d_assoc, + start_index_bit = block_size_bits) + l2_cache = L2Cache(size = options.l2_size, + assoc = options.l2_assoc, + start_index_bit = block_size_bits) + + l1_cntrl = L1Cache_Controller(version = options.num_cpus+i, + L1Icache = l1i_cache, + L1Dcache = l1d_cache, + L2cache = l2_cache, + no_mig_atomic = not \ + options.allow_atomic_migration, + send_evictions = ( + options.cpu_type == "detailed"), + transitions_per_cycle = options.ports, + #clk_domain=system.cpu[i].clk_domain, + is_gpu = True, + ruby_system = ruby_system) + + cpu_seq = RubySequencer(version = options.num_cpus + i, + icache = l1i_cache, + dcache = l1d_cache, + access_phys_mem = True, + max_outstanding_requests = options.gpu_l1_buf_depth, + #clk_domain=system.cpu[i].clk_domain, + ruby_system = ruby_system, + connect_to_io = False) + + l1_cntrl.sequencer = cpu_seq + if options.recycle_latency: + l1_cntrl.recycle_latency = options.recycle_latency + + exec("ruby_system.l1_cntrl_sp%02d = l1_cntrl" % i) + + # + # Add controllers and sequencers to the appropriate lists + # + cpu_sequencers.append(cpu_seq) + topology.addController(l1_cntrl) + + # Connect the L1 controller and the network + # Connect the buffers from the controller to network + l1_cntrl.requestFromCache = ruby_system.network.slave + l1_cntrl.responseFromCache = ruby_system.network.slave + l1_cntrl.unblockFromCache = ruby_system.network.slave + + # Connect the buffers from the network to the controller + l1_cntrl.forwardToCache = 
ruby_system.network.master + l1_cntrl.responseToCache = ruby_system.network.master + + cntrl_count += 1 + + ############################################################################ + # Pagewalk cache + # NOTE: We use a CPU L1 cache controller here. This is to facilatate MMU + # cache coherence (as the GPU L1 caches are incoherent without flushes + # The L2 cache is small, and should have minimal affect on the + # performance (see Section 6.2 of Power et al. HPCA 2014). + pwd_cache = L1Cache(size = options.pwc_size, + assoc = 16, # 64 is fully associative @ 8kB + replacement_policy = "LRU", + start_index_bit = block_size_bits, + latency = 8, + resourceStalls = False) + # Small cache since CPU L1 requires I and D + pwi_cache = L1Cache(size = "512B", + assoc = 2, + replacement_policy = "LRU", + start_index_bit = block_size_bits, + latency = 8, + resourceStalls = False) + # Small cache since CPU L1 controller requires L2 + l2_cache = L2Cache(size = "512B", + assoc = 2, + start_index_bit = block_size_bits, + latency = 1, + resourceStalls = False) + + l1_cntrl = L1Cache_Controller(version = options.num_cpus + options.num_sc, + L1Icache = pwi_cache, + L1Dcache = pwd_cache, + L2cache = l2_cache, + send_evictions = False, + cache_response_latency = 1, + l2_cache_hit_latency = 1, + number_of_TBEs = options.gpu_l1_buf_depth, + ruby_system = ruby_system) + + cpu_seq = RubySequencer(version = options.num_cpus + options.num_sc, + icache = pwd_cache, # Never get data from pwi_cache + dcache = pwd_cache, + access_phys_mem = True, + max_outstanding_requests = options.gpu_l1_buf_depth, + ruby_system = ruby_system, + deadlock_threshold = 2000000, + connect_to_io = False) + + l1_cntrl.sequencer = cpu_seq + + + ruby_system.l1_pw_cntrl = l1_cntrl + cpu_sequencers.append(cpu_seq) + + topology.addController(l1_cntrl) + + # Connect the L1 controller and the network + # Connect the buffers from the controller to network + l1_cntrl.requestFromCache = ruby_system.network.slave + l1_cntrl.responseFromCache = ruby_system.network.slave + l1_cntrl.unblockFromCache = ruby_system.network.slave + + # Connect the buffers from the network to the controller + l1_cntrl.forwardToCache = ruby_system.network.master + l1_cntrl.responseToCache = ruby_system.network.master + + # Copy engine cache (make as small as possible, ideally 0) + l1i_cache = L1Cache(size = "2kB", assoc = 2) + l1d_cache = L1Cache(size = "2kB", assoc = 2) + l2_cache = L2Cache(size = "2kB", + assoc = 2, + start_index_bit = block_size_bits) + + l1_cntrl = L1Cache_Controller(version = options.num_cpus+options.num_sc+1, + L1Icache = l1i_cache, + L1Dcache = l1d_cache, + L2cache = l2_cache, + no_mig_atomic = not \ + options.allow_atomic_migration, + send_evictions = ( + options.cpu_type == "detailed"), + ruby_system = ruby_system) + + # + # Only one unified L1 cache exists. Can cache instructions and data. 
+ # + cpu_seq = RubySequencer(version = options.num_cpus + options.num_sc + 1, + icache = l1i_cache, + dcache = l1d_cache, + access_phys_mem = True, + max_outstanding_requests = 64, + ruby_system = ruby_system, + connect_to_io = False) + + l1_cntrl.sequencer = cpu_seq + + ruby_system.l1_cntrl_ce = l1_cntrl + + cpu_sequencers.append(cpu_seq) + topology.addController(l1_cntrl) + + # Connect the L1 controller and the network + # Connect the buffers from the controller to network + l1_cntrl.requestFromCache = ruby_system.network.slave + l1_cntrl.responseFromCache = ruby_system.network.slave + l1_cntrl.unblockFromCache = ruby_system.network.slave + + # Connect the buffers from the network to the controller + l1_cntrl.forwardToCache = ruby_system.network.master + l1_cntrl.responseToCache = ruby_system.network.master + + # BCU + cntrl = BorderControlUnit_Controller(version = 0, + ruby_system = ruby_system) + ruby_system.bcu_cntrl = cntrl + topology.addController(cntrl) + + cntrl.unblockToDirFromCache = ruby_system.network.master + cntrl.responseToDirFromCache = ruby_system.network.master + cntrl.requestToDirFromCache = ruby_system.network.master + + # Connect the buffers from the network to the controller + cntrl.requestFromCacheToDir = ruby_system.network.slave + cntrl.responseFromCacheToDir = ruby_system.network.slave + cntrl.unblockFromCacheToDir = ruby_system.network.slave + + return (cpu_sequencers, dir_cntrl_nodes, topology) diff -r 3ee9d80f490f -r 7b001aa001f0 src/mem/protocol/MOESI_hammer-GPUcache.sm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/mem/protocol/MOESI_hammer-GPUcache.sm Wed Dec 02 17:08:48 2015 -0600 @@ -0,0 +1,2212 @@ +/* + * Copyright (c) 1999-2013 Mark D. Hill and David A. Wood + * Copyright (c) 2009 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * AMD's contributions to the MOESI hammer protocol do not constitute an + * endorsement of its similarity to any AMD products. 
+ * + * Authors: Milo Martin + * Brad Beckmann + */ + +machine({L1Cache, L2Cache}, "AMD Hammer-like protocol") + : Sequencer * sequencer; + CacheMemory * L1Icache; + CacheMemory * L1Dcache; + CacheMemory * L2cache; + Cycles cache_response_latency := 10; + Cycles issue_latency := 2; + Cycles l2_cache_hit_latency := 10; + bool is_gpu := "False"; + bool no_mig_atomic := "True"; + bool send_evictions; + + // NETWORK BUFFERS + MessageBuffer * requestFromCache, network="To", virtual_network="2", + ordered="false", vnet_type="request"; + MessageBuffer * responseFromCache, network="To", virtual_network="4", + ordered="false", vnet_type="response"; + MessageBuffer * unblockFromCache, network="To", virtual_network="5", + ordered="false", vnet_type="unblock"; + + MessageBuffer * forwardToCache, network="From", virtual_network="3", + ordered="false", vnet_type="forward"; + MessageBuffer * responseToCache, network="From", virtual_network="4", + ordered="false", vnet_type="response"; + +{ + + // STATES + state_declaration(State, desc="Cache states", default="L1Cache_State_I") { + // Base states + I, AccessPermission:Invalid, desc="Idle"; + S, AccessPermission:Read_Only, desc="Shared"; + O, AccessPermission:Read_Only, desc="Owned"; + M, AccessPermission:Read_Only, desc="Modified (dirty)"; + MM, AccessPermission:Read_Write, desc="Modified (dirty and locally modified)"; + + // Base states, locked and ready to service the mandatory queue + IR, AccessPermission:Invalid, desc="Idle"; + SR, AccessPermission:Read_Only, desc="Shared"; + OR, AccessPermission:Read_Only, desc="Owned"; + MR, AccessPermission:Read_Only, desc="Modified (dirty)"; + MMR, AccessPermission:Read_Write, desc="Modified (dirty and locally modified)"; + + // Transient States + IM, AccessPermission:Busy, "IM", desc="Issued GetX"; + SM, AccessPermission:Read_Only, "SM", desc="Issued GetX, we still have a valid copy of the line"; + OM, AccessPermission:Read_Only, "OM", desc="Issued GetX, received data"; + ISM, AccessPermission:Read_Only, "ISM", desc="Issued GetX, received valid data, waiting for all acks"; + M_W, AccessPermission:Read_Only, "M^W", desc="Issued GetS, received exclusive data"; + MM_W, AccessPermission:Read_Write, "MM^W", desc="Issued GetX, received exclusive data"; + IS, AccessPermission:Busy, "IS", desc="Issued GetS"; + SS, AccessPermission:Read_Only, "SS", desc="Issued GetS, received data, waiting for all acks"; + OI, AccessPermission:Busy, "OI", desc="Issued PutO, waiting for ack"; + MI, AccessPermission:Busy, "MI", desc="Issued PutX, waiting for ack"; + II, AccessPermission:Busy, "II", desc="Issued PutX/O, saw Other_GETS or Other_GETX, waiting for ack"; + IT, AccessPermission:Busy, "IT", desc="Invalid block transferring to L1"; + ST, AccessPermission:Busy, "ST", desc="S block transferring to L1"; + OT, AccessPermission:Busy, "OT", desc="O block transferring to L1"; + MT, AccessPermission:Busy, "MT", desc="M block transferring to L1"; + MMT, AccessPermission:Busy, "MMT", desc="MM block transferring to L0"; + + //Transition States Related to Flushing + MI_F, AccessPermission:Busy, "MI_F", desc="Issued PutX due to a Flush, waiting for ack"; + MM_F, AccessPermission:Busy, "MM_F", desc="Issued GETF due to a Flush, waiting for ack"; + IM_F, AccessPermission:Busy, "IM_F", desc="Issued GetX due to a Flush"; + ISM_F, AccessPermission:Read_Only, "ISM_F", desc="Issued GetX, received data, waiting for all acks"; + SM_F, AccessPermission:Read_Only, "SM_F", desc="Issued GetX, we still have an old copy of the line"; + OM_F, 
AccessPermission:Read_Only, "OM_F", desc="Issued GetX, received data"; + MM_WF, AccessPermission:Busy, "MM_WF", desc="Issued GetX, received exclusive data"; + } + + // EVENTS + enumeration(Event, desc="Cache events") { + Load, desc="Load request from the processor"; + Ifetch, desc="I-fetch request from the processor"; + Store, desc="Store request from the processor"; + L2_Replacement, desc="L2 Replacement"; + L1_to_L2, desc="L1 to L2 transfer"; + Trigger_L2_to_L1D, desc="Trigger L2 to L1-Data transfer"; + Trigger_L2_to_L1I, desc="Trigger L2 to L1-Instruction transfer"; + Complete_L2_to_L1, desc="L2 to L1 transfer completed"; + + // Requests + Other_GETX, desc="A GetX from another processor"; + Other_GETS, desc="A GetS from another processor"; + Merged_GETS, desc="A Merged GetS from another processor"; + Other_GETS_No_Mig, desc="A GetS from another processor"; + NC_DMA_GETS, desc="special GetS when only DMA exists"; + Invalidate, desc="Invalidate block"; + + // Responses + Ack, desc="Received an ack message"; + Shared_Ack, desc="Received an ack message, responder has a shared copy"; + Data, desc="Received a data message"; + Shared_Data, desc="Received a data message, responder has a shared copy"; + Exclusive_Data, desc="Received a data message, responder had an exclusive copy, they gave it to us"; + + Writeback_Ack, desc="Writeback O.K. from directory"; + Writeback_Nack, desc="Writeback not O.K. from directory"; + + // Triggers + All_acks, desc="Received all required data and message acks"; + All_acks_no_sharers, desc="Received all acks and no other processor has a shared copy"; + + // For Flush + Flush_line, desc="flush the cache line from all caches"; + Block_Ack, desc="the directory is blocked and ready for the flush"; + } + + // TYPES + + // STRUCTURE DEFINITIONS + + MessageBuffer mandatoryQueue, ordered="false"; + + // CacheEntry + structure(Entry, desc="...", interface="AbstractCacheEntry") { + State CacheState, desc="cache state"; + bool Dirty, desc="Is the data dirty (different than memory)?"; + DataBlock DataBlk, desc="data for the block"; + bool FromL2, default="false", desc="block just moved from L2"; + bool AtomicAccessed, default="false", desc="block just moved from L2"; + } + + // TBE fields + structure(TBE, desc="...") { + State TBEState, desc="Transient state"; + DataBlock DataBlk, desc="data for the block, required for concurrent writebacks"; + bool Dirty, desc="Is the data dirty (different than memory)?"; + int NumPendingMsgs, desc="Number of acks/data messages that this processor is waiting for"; + bool Sharers, desc="On a GetS, did we find any other sharers in the system"; + bool AppliedSilentAcks, default="false", desc="for full-bit dir, does the pending msg count reflect the silent acks"; + MachineID LastResponder, desc="last machine to send a response for this request"; + MachineID CurOwner, desc="current owner of the block, used for UnblockS responses"; + + Cycles InitialRequestTime, default="Cycles(0)", + desc="time the initial requests was sent from the L1Cache"; + Cycles ForwardRequestTime, default="Cycles(0)", + desc="time the dir forwarded the request"; + Cycles FirstResponseTime, default="Cycles(0)", + desc="the time the first response was received"; + } + + structure(TBETable, external="yes") { + TBE lookup(Address); + void allocate(Address); + void deallocate(Address); + bool isPresent(Address); + } + + TBETable TBEs, template="", constructor="m_number_of_TBEs"; + + void set_cache_entry(AbstractCacheEntry b); + void unset_cache_entry(); + void set_tbe(TBE b); 
+
+  AccessPermission getAccessPermission(Address addr) {
+    TBE tbe := TBEs[addr];
+    if (is_valid(tbe)) {
+      return L1Cache_State_to_permission(tbe.TBEState);
+    }
+
+    Entry cache_entry := getCacheEntry(addr);
+    if (is_valid(cache_entry)) {
+      return L1Cache_State_to_permission(cache_entry.CacheState);
+    }
+
+    return AccessPermission:NotPresent;
+  }
+
+  void setAccessPermission(Entry cache_entry, Address addr, State state) {
+    if (is_valid(cache_entry)) {
+      cache_entry.changePermission(L1Cache_State_to_permission(state));
+    }
+  }
+
+  Event mandatory_request_type_to_event(RubyRequestType type) {
+    if (type == RubyRequestType:LD) {
+      return Event:Load;
+    } else if (type == RubyRequestType:IFETCH) {
+      return Event:Ifetch;
+    } else if ((type == RubyRequestType:ST) || (type == RubyRequestType:ATOMIC)) {
+      return Event:Store;
+    } else if (type == RubyRequestType:FLUSH) {
+      return Event:Flush_line;
+    } else {
+      error("Invalid RubyRequestType");
+    }
+  }
+
+  MachineType testAndClearLocalHit(Entry cache_entry) {
+    if (is_valid(cache_entry) && cache_entry.FromL2) {
+      cache_entry.FromL2 := false;
+      return MachineType:L2Cache;
+    }
+    return MachineType:L1Cache;
+  }
+
+  bool IsAtomicAccessed(Entry cache_entry) {
+    assert(is_valid(cache_entry));
+    return cache_entry.AtomicAccessed;
+  }
+
+  MessageBuffer triggerQueue, ordered="false";
+
+  // ** OUT_PORTS **
+
+  out_port(requestNetwork_out, RequestMsg, requestFromCache);
+  out_port(responseNetwork_out, ResponseMsg, responseFromCache);
+  out_port(unblockNetwork_out, ResponseMsg, unblockFromCache);
+  out_port(triggerQueue_out, TriggerMsg, triggerQueue);
+
+  // ** IN_PORTS **
+
+  // Trigger Queue
+  in_port(triggerQueue_in, TriggerMsg, triggerQueue, rank=3) {
+    if (triggerQueue_in.isReady()) {
+      peek(triggerQueue_in, TriggerMsg) {
+
+        Entry cache_entry := getCacheEntry(in_msg.Addr);
+        TBE tbe := TBEs[in_msg.Addr];
+
+        if (in_msg.Type == TriggerType:L2_to_L1) {
+          trigger(Event:Complete_L2_to_L1, in_msg.Addr, cache_entry, tbe);
+        } else if (in_msg.Type == TriggerType:ALL_ACKS) {
+          trigger(Event:All_acks, in_msg.Addr, cache_entry, tbe);
+        } else if (in_msg.Type == TriggerType:ALL_ACKS_NO_SHARERS) {
+          trigger(Event:All_acks_no_sharers, in_msg.Addr, cache_entry, tbe);
+        } else {
+          error("Unexpected message");
+        }
+      }
+    }
+  }
+
+  // Nothing from the unblock network
+
+  // Response Network
+  in_port(responseToCache_in, ResponseMsg, responseToCache, rank=2) {
+    if (responseToCache_in.isReady()) {
+      peek(responseToCache_in, ResponseMsg, block_on="Addr") {
+
+        Entry cache_entry := getCacheEntry(in_msg.Addr);
+        TBE tbe := TBEs[in_msg.Addr];
+
+        if (in_msg.Type == CoherenceResponseType:ACK) {
+          trigger(Event:Ack, in_msg.Addr, cache_entry, tbe);
+        } else if (in_msg.Type == CoherenceResponseType:ACK_SHARED) {
+          trigger(Event:Shared_Ack, in_msg.Addr, cache_entry, tbe);
+        } else if (in_msg.Type == CoherenceResponseType:DATA) {
+          trigger(Event:Data, in_msg.Addr, cache_entry, tbe);
+        } else if (in_msg.Type == CoherenceResponseType:DATA_SHARED) {
+          trigger(Event:Shared_Data, in_msg.Addr, cache_entry, tbe);
+        } else if (in_msg.Type == CoherenceResponseType:DATA_EXCLUSIVE) {
+          trigger(Event:Exclusive_Data, in_msg.Addr, cache_entry, tbe);
+        } else {
+          error("Unexpected message");
+        }
+      }
+    }
+  }
+
+  // Forward Network
+  in_port(forwardToCache_in, RequestMsg, forwardToCache, rank=1) {
+    if (forwardToCache_in.isReady()) {
+      peek(forwardToCache_in, RequestMsg, block_on="Addr") {
+
+        Entry cache_entry := getCacheEntry(in_msg.Addr);
+        TBE tbe := TBEs[in_msg.Addr];
+
+        if ((in_msg.Type == CoherenceRequestType:GETX) ||
+            (in_msg.Type == CoherenceRequestType:GETF)) {
+          trigger(Event:Other_GETX, in_msg.Addr, cache_entry, tbe);
+        } else if (in_msg.Type == CoherenceRequestType:MERGED_GETS) {
+          trigger(Event:Merged_GETS, in_msg.Addr, cache_entry, tbe);
+        } else if (in_msg.Type == CoherenceRequestType:GETS) {
+          if (machineCount(MachineType:L1Cache) > 1) {
+            if (is_valid(cache_entry)) {
+              if (IsAtomicAccessed(cache_entry) && no_mig_atomic) {
+                trigger(Event:Other_GETS_No_Mig, in_msg.Addr, cache_entry, tbe);
+              } else {
+                trigger(Event:Other_GETS, in_msg.Addr, cache_entry, tbe);
+              }
+            } else {
+              trigger(Event:Other_GETS, in_msg.Addr, cache_entry, tbe);
+            }
+          } else {
+            trigger(Event:NC_DMA_GETS, in_msg.Addr, cache_entry, tbe);
+          }
+        } else if (in_msg.Type == CoherenceRequestType:INV) {
+          trigger(Event:Invalidate, in_msg.Addr, cache_entry, tbe);
+        } else if (in_msg.Type == CoherenceRequestType:WB_ACK) {
+          trigger(Event:Writeback_Ack, in_msg.Addr, cache_entry, tbe);
+        } else if (in_msg.Type == CoherenceRequestType:WB_NACK) {
+          trigger(Event:Writeback_Nack, in_msg.Addr, cache_entry, tbe);
+        } else if (in_msg.Type == CoherenceRequestType:BLOCK_ACK) {
+          trigger(Event:Block_Ack, in_msg.Addr, cache_entry, tbe);
+        } else {
+          error("Unexpected message");
+        }
+      }
+    }
+  }
+
+  // Nothing from the request network
+
+  // Mandatory Queue
+  in_port(mandatoryQueue_in, RubyRequest, mandatoryQueue, desc="...", rank=0) {
+    if (mandatoryQueue_in.isReady()) {
+      peek(mandatoryQueue_in, RubyRequest, block_on="LineAddress") {
+
+        // Check for data access to blocks in I-cache and ifetchs to blocks in D-cache
+        TBE tbe := TBEs[in_msg.LineAddress];
+
+        if (in_msg.Type == RubyRequestType:IFETCH) {
+          // ** INSTRUCTION ACCESS ***
+
+          Entry L1Icache_entry := getL1ICacheEntry(in_msg.LineAddress);
+          if (is_valid(L1Icache_entry)) {
+            // The tag matches for the L1, so the L1 fetches the line.
+            // We know it can't be in the L2 due to exclusion.
+            trigger(mandatory_request_type_to_event(in_msg.Type),
+                    in_msg.LineAddress, L1Icache_entry, tbe);
+          } else {
+            // Check to see if it is in the OTHER L1
+            Entry L1Dcache_entry := getL1DCacheEntry(in_msg.LineAddress);
+            if (is_valid(L1Dcache_entry)) {
+              // The block is in the wrong L1, try to write it to the L2
+              if (L2cache.cacheAvail(in_msg.LineAddress)) {
+                trigger(Event:L1_to_L2, in_msg.LineAddress, L1Dcache_entry, tbe);
+              } else {
+                Address l2_victim_addr := L2cache.cacheProbe(in_msg.LineAddress);
+                trigger(Event:L2_Replacement,
+                        l2_victim_addr,
+                        getL2CacheEntry(l2_victim_addr),
+                        TBEs[l2_victim_addr]);
+              }
+            }
+
+            if (L1Icache.cacheAvail(in_msg.LineAddress)) {
+              // L1 doesn't have the line, but we have space for it in the L1
+              Entry L2cache_entry := getL2CacheEntry(in_msg.LineAddress);
+              if (is_valid(L2cache_entry)) {
+                // L2 has it (maybe not with the right permissions)
+                trigger(Event:Trigger_L2_to_L1I, in_msg.LineAddress,
+                        L2cache_entry, tbe);
+              } else {
+                // We have room, the L2 doesn't have it, so the L1 fetches the line
+                trigger(mandatory_request_type_to_event(in_msg.Type),
+                        in_msg.LineAddress, L1Icache_entry, tbe);
+              }
+            } else {
+              // No room in the L1, so we need to make room
+              Address l1i_victim_addr := L1Icache.cacheProbe(in_msg.LineAddress);
+              if (L2cache.cacheAvail(l1i_victim_addr)) {
+                // The L2 has room, so we move the line from the L1 to the L2
+                trigger(Event:L1_to_L2,
+                        l1i_victim_addr,
+                        getL1ICacheEntry(l1i_victim_addr),
+                        TBEs[l1i_victim_addr]);
+              } else {
+                Address l2_victim_addr := L2cache.cacheProbe(l1i_victim_addr);
+                // The L2 does not have room, so we replace a line from the L2
+                trigger(Event:L2_Replacement,
+                        l2_victim_addr,
+                        getL2CacheEntry(l2_victim_addr),
+                        TBEs[l2_victim_addr]);
+              }
+            }
+          }
+        } else {
+          // *** DATA ACCESS ***
+
+          Entry L1Dcache_entry := getL1DCacheEntry(in_msg.LineAddress);
+          if (is_valid(L1Dcache_entry)) {
+            // The tag matches for the L1, so the L1 fetches the line.
+            // We know it can't be in the L2 due to exclusion.
+            trigger(mandatory_request_type_to_event(in_msg.Type),
+                    in_msg.LineAddress, L1Dcache_entry, tbe);
+          } else {
+
+            // Check to see if it is in the OTHER L1
+            Entry L1Icache_entry := getL1ICacheEntry(in_msg.LineAddress);
+            if (is_valid(L1Icache_entry)) {
+              // The block is in the wrong L1, try to write it to the L2
+              if (L2cache.cacheAvail(in_msg.LineAddress)) {
+                trigger(Event:L1_to_L2, in_msg.LineAddress, L1Icache_entry, tbe);
+              } else {
+                Address l2_victim_addr := L2cache.cacheProbe(in_msg.LineAddress);
+                trigger(Event:L2_Replacement,
+                        l2_victim_addr,
+                        getL2CacheEntry(l2_victim_addr),
+                        TBEs[l2_victim_addr]);
+              }
+            }
+
+            if (L1Dcache.cacheAvail(in_msg.LineAddress)) {
+              // L1 doesn't have the line, but we have space for it in the L1
+              Entry L2cache_entry := getL2CacheEntry(in_msg.LineAddress);
+              if (is_valid(L2cache_entry)) {
+                // L2 has it (maybe not with the right permissions)
+                trigger(Event:Trigger_L2_to_L1D, in_msg.LineAddress,
+                        L2cache_entry, tbe);
+              } else {
+                // We have room, the L2 doesn't have it, so the L1 fetches the line
+                trigger(mandatory_request_type_to_event(in_msg.Type),
+                        in_msg.LineAddress, L1Dcache_entry, tbe);
+              }
+            } else {
+              // No room in the L1, so we need to make room
+              Address l1d_victim_addr := L1Dcache.cacheProbe(in_msg.LineAddress);
+              if (L2cache.cacheAvail(l1d_victim_addr)) {
+                // The L2 has room, so we move the line from the L1 to the L2
+                trigger(Event:L1_to_L2,
+                        l1d_victim_addr,
+                        getL1DCacheEntry(l1d_victim_addr),
+                        TBEs[l1d_victim_addr]);
+              } else {
+                Address l2_victim_addr := L2cache.cacheProbe(l1d_victim_addr);
+                // The L2 does not have room, so we replace a line from the L2
+                trigger(Event:L2_Replacement,
+                        l2_victim_addr,
+                        getL2CacheEntry(l2_victim_addr),
+                        TBEs[l2_victim_addr]);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
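+
+  // NOTE: the decision tree above implements the exclusive hierarchy:
+  // 1) hit in the matching L1 -> service directly (exclusion: not in L2);
+  // 2) hit in the other L1 -> migrate it L1 -> L2 first (L1_to_L2), after
+  //    freeing an L2 victim if needed (L2_Replacement);
+  // 3) miss in L1 but hit in L2 -> Trigger_L2_to_L1{I,D} moves it up;
+  // 4) miss everywhere -> fall through to a GETS/GETX request, with L1/L2
+  //    victims evicted first so the fill always has a slot.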
+
+  // ACTIONS
+
+  action(a_issueGETS, "a", desc="Issue GETS") {
+    enqueue(requestNetwork_out, RequestMsg, issue_latency) {
+      assert(is_valid(tbe));
+      out_msg.Addr := address;
+      out_msg.Type := CoherenceRequestType:GETS;
+      out_msg.Requestor := machineID;
+      if (is_gpu) {
+        DPRINTF(RubySlicc, "Setting up the broadcast\n");
+        out_msg.Destination.broadcast(MachineType:BorderControlUnit);
+      } else {
+        out_msg.Destination.add(map_Address_to_Directory(address));
+      }
+      out_msg.OriginalDestination.add(map_Address_to_Directory(address));
+      out_msg.MessageSize := MessageSizeType:Request_Control;
+      out_msg.InitialRequestTime := curCycle();
+
+      // One from each other cache (n-1) plus the memory (+1)
+      tbe.NumPendingMsgs := machineCount(MachineType:L1Cache);
+    }
+  }
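+
+  // NOTE: with is_gpu set the request is broadcast to every
+  // BorderControlUnit, which relays it to the home directory recorded in
+  // OriginalDestination; otherwise it goes to the directory directly.
+  // Worked example of the pending-message count (hypothetical n = 4 L1s,
+  // one of which owns the block): NumPendingMsgs starts at 4; the two
+  // non-owner peers ack with Acks = 1 each and the owner's data response
+  // carries Acks = 2 (itself plus memory), so 1 + 1 + 2 = 4 and the
+  // completion trigger fires.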
+
+  action(b_issueGETX, "b", desc="Issue GETX") {
+    enqueue(requestNetwork_out, RequestMsg, issue_latency) {
+      assert(is_valid(tbe));
+      out_msg.Addr := address;
+      out_msg.Type := CoherenceRequestType:GETX;
+      out_msg.Requestor := machineID;
+      if (is_gpu) {
+        DPRINTF(RubySlicc, "Setting up the broadcast\n");
+        out_msg.Destination.broadcast(MachineType:BorderControlUnit);
+      } else {
+        out_msg.Destination.add(map_Address_to_Directory(address));
+      }
+      out_msg.OriginalDestination.add(map_Address_to_Directory(address));
+      out_msg.MessageSize := MessageSizeType:Request_Control;
+      out_msg.InitialRequestTime := curCycle();
+
+      // One from each other cache (n-1) plus the memory (+1)
+      tbe.NumPendingMsgs := machineCount(MachineType:L1Cache);
+    }
+  }
+
+  action(b_issueGETXIfMoreThanOne, "bo", desc="Issue GETX") {
+    if (machineCount(MachineType:L1Cache) > 1) {
+      enqueue(requestNetwork_out, RequestMsg, issue_latency) {
+        assert(is_valid(tbe));
+        out_msg.Addr := address;
+        out_msg.Type := CoherenceRequestType:GETX;
+        out_msg.Requestor := machineID;
+        if (is_gpu) {
+          DPRINTF(RubySlicc, "Setting up the broadcast\n");
+          out_msg.Destination.broadcast(MachineType:BorderControlUnit);
+        } else {
+          out_msg.Destination.add(map_Address_to_Directory(address));
+        }
+        out_msg.OriginalDestination.add(map_Address_to_Directory(address));
+        out_msg.MessageSize := MessageSizeType:Request_Control;
+        out_msg.InitialRequestTime := curCycle();
+      }
+    }
+
+    // One from each other cache (n-1) plus the memory (+1)
+    tbe.NumPendingMsgs := machineCount(MachineType:L1Cache);
+  }
+
+  action(bf_issueGETF, "bf", desc="Issue GETF") {
+    enqueue(requestNetwork_out, RequestMsg, issue_latency) {
+      assert(is_valid(tbe));
+      out_msg.Addr := address;
+      out_msg.Type := CoherenceRequestType:GETF;
+      out_msg.Requestor := machineID;
+      if (is_gpu) {
+        DPRINTF(RubySlicc, "Setting up the broadcast\n");
+        out_msg.Destination.broadcast(MachineType:BorderControlUnit);
+      } else {
+        out_msg.Destination.add(map_Address_to_Directory(address));
+      }
+      out_msg.OriginalDestination.add(map_Address_to_Directory(address));
+      out_msg.MessageSize := MessageSizeType:Request_Control;
+      out_msg.InitialRequestTime := curCycle();
+
+      // One from each other cache (n-1) plus the memory (+1)
+      tbe.NumPendingMsgs := machineCount(MachineType:L1Cache);
+    }
+  }
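+
+  // NOTE: GETF opens the flush handshake: the directory blocks the line and
+  // answers with Block_Ack, after which this cache issues PUTF
+  // (df_issuePUTF) and writes the data back on Writeback_Ack (see the
+  // MM_F/MI_F transitions below).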
+
+  action(c_sendExclusiveData, "c", desc="Send exclusive data from cache to requestor") {
+    peek(forwardToCache_in, RequestMsg) {
+      enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) {
+        assert(is_valid(cache_entry));
+        out_msg.Addr := address;
+        out_msg.Type := CoherenceResponseType:DATA_EXCLUSIVE;
+        out_msg.Sender := machineID;
+        out_msg.Destination.add(in_msg.Requestor);
+        out_msg.OriginalDestination.add(in_msg.Requestor);
+        out_msg.DataBlk := cache_entry.DataBlk;
+        out_msg.Dirty := cache_entry.Dirty;
+        if (in_msg.DirectedProbe) {
+          out_msg.Acks := machineCount(MachineType:L1Cache);
+        } else {
+          out_msg.Acks := 2;
+        }
+        out_msg.SilentAcks := in_msg.SilentAcks;
+        out_msg.MessageSize := MessageSizeType:Response_Data;
+        out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+        out_msg.ForwardRequestTime := in_msg.ForwardRequestTime;
+      }
+    }
+  }
+
+  action(ct_sendExclusiveDataFromTBE, "ct", desc="Send exclusive data from tbe to requestor") {
+    peek(forwardToCache_in, RequestMsg) {
+      enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) {
+        assert(is_valid(tbe));
+        out_msg.Addr := address;
+        out_msg.Type := CoherenceResponseType:DATA_EXCLUSIVE;
+        out_msg.Sender := machineID;
+        out_msg.Destination.add(in_msg.Requestor);
+        out_msg.OriginalDestination.add(in_msg.Requestor);
+        out_msg.DataBlk := tbe.DataBlk;
+        out_msg.Dirty := tbe.Dirty;
+        if (in_msg.DirectedProbe) {
+          out_msg.Acks := machineCount(MachineType:L1Cache);
+        } else {
+          out_msg.Acks := 2;
+        }
+        out_msg.SilentAcks := in_msg.SilentAcks;
+        out_msg.MessageSize := MessageSizeType:Response_Data;
+        out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+        out_msg.ForwardRequestTime := in_msg.ForwardRequestTime;
+      }
+    }
+  }
+
+  action(d_issuePUT, "d", desc="Issue PUT") {
+    enqueue(requestNetwork_out, RequestMsg, issue_latency) {
+      out_msg.Addr := address;
+      out_msg.Type := CoherenceRequestType:PUT;
+      out_msg.Requestor := machineID;
+      if (is_gpu) {
+        DPRINTF(RubySlicc, "Setting up the broadcast\n");
+        out_msg.Destination.broadcast(MachineType:BorderControlUnit);
+      } else {
+        out_msg.Destination.add(map_Address_to_Directory(address));
+      }
+      out_msg.OriginalDestination.add(map_Address_to_Directory(address));
+      out_msg.MessageSize := MessageSizeType:Writeback_Control;
+    }
+  }
+
+  action(df_issuePUTF, "df", desc="Issue PUTF") {
+    enqueue(requestNetwork_out, RequestMsg, issue_latency) {
+      out_msg.Addr := address;
+      out_msg.Type := CoherenceRequestType:PUTF;
+      out_msg.Requestor := machineID;
+      if (is_gpu) {
+        DPRINTF(RubySlicc, "Setting up the broadcast\n");
+        out_msg.Destination.broadcast(MachineType:BorderControlUnit);
+      } else {
+        out_msg.Destination.add(map_Address_to_Directory(address));
+      }
+      out_msg.OriginalDestination.add(map_Address_to_Directory(address));
+      out_msg.MessageSize := MessageSizeType:Writeback_Control;
+    }
+  }
+
+  action(e_sendData, "e", desc="Send data from cache to requestor") {
+    peek(forwardToCache_in, RequestMsg) {
+      enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) {
+        assert(is_valid(cache_entry));
+        out_msg.Addr := address;
+        out_msg.Type := CoherenceResponseType:DATA;
+        out_msg.Sender := machineID;
+        out_msg.Destination.add(in_msg.Requestor);
+        out_msg.OriginalDestination.add(in_msg.Requestor);
+        out_msg.DataBlk := cache_entry.DataBlk;
+        out_msg.Dirty := cache_entry.Dirty;
+        if (in_msg.DirectedProbe) {
+          out_msg.Acks := machineCount(MachineType:L1Cache);
+        } else {
+          out_msg.Acks := 2;
+        }
+        out_msg.SilentAcks := in_msg.SilentAcks;
+        out_msg.MessageSize := MessageSizeType:Response_Data;
+        out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+        out_msg.ForwardRequestTime := in_msg.ForwardRequestTime;
+      }
+    }
+  }
+
+  action(ee_sendDataShared, "\e", desc="Send data from cache to requestor, while remaining the owner") {
+    peek(forwardToCache_in, RequestMsg) {
+      enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) {
+        assert(is_valid(cache_entry));
+        out_msg.Addr := address;
+        out_msg.Type := CoherenceResponseType:DATA_SHARED;
+        out_msg.Sender := machineID;
+        out_msg.Destination.add(in_msg.Requestor);
+        out_msg.OriginalDestination.add(in_msg.Requestor);
+        out_msg.DataBlk := cache_entry.DataBlk;
+        out_msg.Dirty := cache_entry.Dirty;
+        DPRINTF(RubySlicc, "%s\n", out_msg.DataBlk);
+        if (in_msg.DirectedProbe) {
+          out_msg.Acks := machineCount(MachineType:L1Cache);
+        } else {
+          out_msg.Acks := 2;
+        }
+        out_msg.SilentAcks := in_msg.SilentAcks;
+        out_msg.MessageSize := MessageSizeType:Response_Data;
+        out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+        out_msg.ForwardRequestTime := in_msg.ForwardRequestTime;
+      }
+    }
+  }
+
+  action(et_sendDataSharedFromTBE, "\et", desc="Send data from TBE to requestor, keep a shared copy") {
+    peek(forwardToCache_in, RequestMsg) {
+      enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) {
+        assert(is_valid(tbe));
+        out_msg.Addr := address;
+        out_msg.Type := CoherenceResponseType:DATA_SHARED;
+        out_msg.Sender := machineID;
+        out_msg.Destination.add(in_msg.Requestor);
+        out_msg.OriginalDestination.add(in_msg.Requestor);
+        out_msg.DataBlk := tbe.DataBlk;
+        out_msg.Dirty := tbe.Dirty;
+        DPRINTF(RubySlicc, "%s\n", out_msg.DataBlk);
+        if (in_msg.DirectedProbe) {
+          out_msg.Acks := machineCount(MachineType:L1Cache);
+        } else {
+          out_msg.Acks := 2;
+        }
+        out_msg.SilentAcks := in_msg.SilentAcks;
+        out_msg.MessageSize := MessageSizeType:Response_Data;
+        out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+        out_msg.ForwardRequestTime := in_msg.ForwardRequestTime;
+      }
+    }
+  }
+
+  action(em_sendDataSharedMultiple, "em", desc="Send data from cache to all requestors, still the owner") {
+    peek(forwardToCache_in, RequestMsg) {
+      enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) {
+        assert(is_valid(cache_entry));
+        out_msg.Addr := address;
+        out_msg.Type := CoherenceResponseType:DATA_SHARED;
+        out_msg.Sender := machineID;
+        out_msg.Destination := in_msg.MergedRequestors;
+        out_msg.OriginalDestination := in_msg.MergedRequestors;
+        out_msg.DataBlk := cache_entry.DataBlk;
+        out_msg.Dirty := cache_entry.Dirty;
+        DPRINTF(RubySlicc, "%s\n", out_msg.DataBlk);
+        out_msg.Acks := machineCount(MachineType:L1Cache);
+        out_msg.SilentAcks := in_msg.SilentAcks;
+        out_msg.MessageSize := MessageSizeType:Response_Data;
+        out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+        out_msg.ForwardRequestTime := in_msg.ForwardRequestTime;
+      }
+    }
+  }
+
+  action(emt_sendDataSharedMultipleFromTBE, "emt", desc="Send data from tbe to all requestors") {
+    peek(forwardToCache_in, RequestMsg) {
+      enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) {
+        assert(is_valid(tbe));
+        out_msg.Addr := address;
+        out_msg.Type := CoherenceResponseType:DATA_SHARED;
+        out_msg.Sender := machineID;
+        out_msg.Destination := in_msg.MergedRequestors;
+        out_msg.OriginalDestination := in_msg.MergedRequestors;
+        out_msg.DataBlk := tbe.DataBlk;
+        out_msg.Dirty := tbe.Dirty;
+        DPRINTF(RubySlicc, "%s\n", out_msg.DataBlk);
+        out_msg.Acks := machineCount(MachineType:L1Cache);
+        out_msg.SilentAcks := in_msg.SilentAcks;
+        out_msg.MessageSize := MessageSizeType:Response_Data;
+        out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+        out_msg.ForwardRequestTime := in_msg.ForwardRequestTime;
+      }
+    }
+  }
+
+  action(f_sendAck, "f", desc="Send ack from cache to requestor") {
+    peek(forwardToCache_in, RequestMsg) {
+      enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) {
+        out_msg.Addr := address;
+        out_msg.Type := CoherenceResponseType:ACK;
+        out_msg.Sender := machineID;
+        out_msg.Destination.add(in_msg.Requestor);
+        out_msg.OriginalDestination.add(in_msg.Requestor);
+        out_msg.Acks := 1;
+        out_msg.SilentAcks := in_msg.SilentAcks;
+        assert(in_msg.DirectedProbe == false);
+        out_msg.MessageSize := MessageSizeType:Response_Control;
+        out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+        out_msg.ForwardRequestTime := in_msg.ForwardRequestTime;
+      }
+    }
+  }
+
+  action(ff_sendAckShared, "\f", desc="Send shared ack from cache to requestor") {
+    peek(forwardToCache_in, RequestMsg) {
+      enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) {
+        out_msg.Addr := address;
+        out_msg.Type := CoherenceResponseType:ACK_SHARED;
+        out_msg.Sender := machineID;
+        out_msg.Destination.add(in_msg.Requestor);
+        out_msg.OriginalDestination.add(in_msg.Requestor);
+        out_msg.Acks := 1;
+        out_msg.SilentAcks := in_msg.SilentAcks;
+        assert(in_msg.DirectedProbe == false);
+        out_msg.MessageSize := MessageSizeType:Response_Control;
+        out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+        out_msg.ForwardRequestTime := in_msg.ForwardRequestTime;
+      }
+    }
+  }
+
+  action(g_sendUnblock, "g", desc="Send unblock to memory") {
+    enqueue(unblockNetwork_out, ResponseMsg, cache_response_latency) {
+      out_msg.Addr := address;
+      out_msg.Type := CoherenceResponseType:UNBLOCK;
+      out_msg.Sender := machineID;
+      if (is_gpu) {
+        out_msg.Destination.broadcast(MachineType:BorderControlUnit);
+      } else {
+        out_msg.Destination.add(map_Address_to_Directory(address));
+      }
+      out_msg.OriginalDestination.add(map_Address_to_Directory(address));
+      out_msg.MessageSize := MessageSizeType:Unblock_Control;
+    }
+  }
+
+  action(gm_sendUnblockM, "gm", desc="Send unblock to memory and indicate M/O/E state") {
+    enqueue(unblockNetwork_out, ResponseMsg, cache_response_latency) {
+      out_msg.Addr := address;
+      out_msg.Type := CoherenceResponseType:UNBLOCKM;
+      out_msg.Sender := machineID;
+      if (is_gpu) {
+        out_msg.Destination.broadcast(MachineType:BorderControlUnit);
+      } else {
+        out_msg.Destination.add(map_Address_to_Directory(address));
+      }
+      out_msg.OriginalDestination.add(map_Address_to_Directory(address));
+      out_msg.MessageSize := MessageSizeType:Unblock_Control;
+    }
+  }
+
+  action(gs_sendUnblockS, "gs", desc="Send unblock to memory and indicate S state") {
+    enqueue(unblockNetwork_out, ResponseMsg, cache_response_latency) {
+      assert(is_valid(tbe));
+      out_msg.Addr := address;
+      out_msg.Type := CoherenceResponseType:UNBLOCKS;
+      out_msg.Sender := machineID;
+      out_msg.CurOwner := tbe.CurOwner;
+      if (is_gpu) {
+        out_msg.Destination.broadcast(MachineType:BorderControlUnit);
+      } else {
+        out_msg.Destination.add(map_Address_to_Directory(address));
+      }
+      out_msg.OriginalDestination.add(map_Address_to_Directory(address));
+      out_msg.MessageSize := MessageSizeType:Unblock_Control;
+    }
+  }
+
+  action(h_load_hit, "h", desc="Notify sequencer the load completed.") {
+    assert(is_valid(cache_entry));
+    DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk);
+    sequencer.readCallback(address, cache_entry.DataBlk, false,
+                           testAndClearLocalHit(cache_entry));
+  }
+
+  action(hx_external_load_hit, "hx", desc="load required external msgs") {
+    assert(is_valid(cache_entry));
+    assert(is_valid(tbe));
+    DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk);
+    peek(responseToCache_in, ResponseMsg) {
+
+      sequencer.readCallback(address, cache_entry.DataBlk, true,
+          machineIDToMachineType(in_msg.Sender), tbe.InitialRequestTime,
+          tbe.ForwardRequestTime, tbe.FirstResponseTime);
+    }
+  }
+
+  action(hh_store_hit, "\h", desc="Notify sequencer that store completed.") {
+    assert(is_valid(cache_entry));
+    DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk);
+    peek(mandatoryQueue_in, RubyRequest) {
+      sequencer.writeCallback(address, cache_entry.DataBlk, false,
+                              testAndClearLocalHit(cache_entry));
+
+      cache_entry.Dirty := true;
+      if (in_msg.Type == RubyRequestType:ATOMIC) {
+        cache_entry.AtomicAccessed := true;
+      }
+    }
+  }
+
+  action(hh_flush_hit, "\hf", desc="Notify sequencer that flush completed.") {
+    assert(is_valid(tbe));
+    DPRINTF(RubySlicc, "%s\n", tbe.DataBlk);
+    sequencer.writeCallback(address, tbe.DataBlk, false, MachineType:L1Cache);
+  }
+
+  action(sx_external_store_hit, "sx", desc="store required external msgs.") {
+    assert(is_valid(cache_entry));
+    assert(is_valid(tbe));
+    DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk);
+    peek(responseToCache_in, ResponseMsg) {
+
+      sequencer.writeCallback(address, cache_entry.DataBlk, true,
+          machineIDToMachineType(in_msg.Sender), tbe.InitialRequestTime,
+          tbe.ForwardRequestTime, tbe.FirstResponseTime);
+    }
+    DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk);
+    cache_entry.Dirty := true;
+  }
+
+  action(sxt_trig_ext_store_hit, "sxt", desc="store required external msgs.") {
+    assert(is_valid(cache_entry));
+    assert(is_valid(tbe));
+    DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk);
+
+    sequencer.writeCallback(address, cache_entry.DataBlk, true,
+        machineIDToMachineType(tbe.LastResponder), tbe.InitialRequestTime,
+        tbe.ForwardRequestTime, tbe.FirstResponseTime);
+
+    cache_entry.Dirty := true;
+  }
+
+  action(i_allocateTBE, "i", desc="Allocate TBE") {
+    check_allocate(TBEs);
+    assert(is_valid(cache_entry));
+    TBEs.allocate(address);
+    set_tbe(TBEs[address]);
+    tbe.DataBlk := cache_entry.DataBlk; // Data only used for writebacks
+    tbe.Dirty := cache_entry.Dirty;
+    tbe.Sharers := false;
+  }
+
+  action(it_allocateTBE, "it", desc="Allocate TBE") {
+    check_allocate(TBEs);
+    TBEs.allocate(address);
+    set_tbe(TBEs[address]);
+    tbe.Dirty := false;
+    tbe.Sharers := false;
+  }
+
+  action(j_popTriggerQueue, "j", desc="Pop trigger queue.") {
+    triggerQueue_in.dequeue();
+  }
+
+  action(k_popMandatoryQueue, "k", desc="Pop mandatory queue.") {
+    mandatoryQueue_in.dequeue();
+  }
+
+  action(l_popForwardQueue, "l", desc="Pop forwarded request queue.") {
+    forwardToCache_in.dequeue();
+  }
+
+  action(hp_copyFromTBEToL2, "li", desc="Copy data from TBE to L2 cache entry.") {
+    assert(is_valid(cache_entry));
+    assert(is_valid(tbe));
+    cache_entry.Dirty := tbe.Dirty;
+    cache_entry.DataBlk := tbe.DataBlk;
+  }
+
+  action(nb_copyFromTBEToL1, "fu", desc="Copy data from TBE to L1 cache entry.") {
+    assert(is_valid(cache_entry));
+    assert(is_valid(tbe));
+    cache_entry.Dirty := tbe.Dirty;
+    cache_entry.DataBlk := tbe.DataBlk;
+    cache_entry.FromL2 := true;
+  }
+
+  action(m_decrementNumberOfMessages, "m", desc="Decrement the number of messages for which we're waiting") {
+    peek(responseToCache_in, ResponseMsg) {
+      assert(in_msg.Acks >= 0);
+      assert(is_valid(tbe));
+      DPRINTF(RubySlicc, "Sender = %s\n", in_msg.Sender);
+      DPRINTF(RubySlicc, "SilentAcks = %d\n", in_msg.SilentAcks);
+      if (tbe.AppliedSilentAcks == false) {
+        tbe.NumPendingMsgs := tbe.NumPendingMsgs - in_msg.SilentAcks;
+        tbe.AppliedSilentAcks := true;
+      }
+      DPRINTF(RubySlicc, "%d\n", tbe.NumPendingMsgs);
+      tbe.NumPendingMsgs := tbe.NumPendingMsgs - in_msg.Acks;
+      DPRINTF(RubySlicc, "%d\n", tbe.NumPendingMsgs);
+      APPEND_TRANSITION_COMMENT(tbe.NumPendingMsgs);
+      APPEND_TRANSITION_COMMENT(in_msg.Sender);
+      tbe.LastResponder := in_msg.Sender;
+      if (tbe.InitialRequestTime != zero_time() && in_msg.InitialRequestTime != zero_time()) {
+        assert(tbe.InitialRequestTime == in_msg.InitialRequestTime);
+      }
+      if (in_msg.InitialRequestTime != zero_time()) {
+        tbe.InitialRequestTime := in_msg.InitialRequestTime;
+      }
+      if (tbe.ForwardRequestTime != zero_time() && in_msg.ForwardRequestTime != zero_time()) {
+        assert(tbe.ForwardRequestTime == in_msg.ForwardRequestTime);
+      }
+      if (in_msg.ForwardRequestTime != zero_time()) {
+        tbe.ForwardRequestTime := in_msg.ForwardRequestTime;
+      }
+      if (tbe.FirstResponseTime == zero_time()) {
+        tbe.FirstResponseTime := curCycle();
+      }
+    }
+  }
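+
+  // NOTE: with a full-bit directory some sharers never send explicit acks;
+  // the first response carries their count in SilentAcks, and it is
+  // subtracted exactly once (guarded by AppliedSilentAcks). Hypothetical
+  // example: n = 8 with three silent sharers -> the first response lowers
+  // NumPendingMsgs by 3 in addition to its own Acks field.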
+
+  action(uo_updateCurrentOwner, "uo", desc="When moving SS state, update current owner.") {
+    peek(responseToCache_in, ResponseMsg) {
+      assert(is_valid(tbe));
+      tbe.CurOwner := in_msg.Sender;
+    }
+  }
+
+  action(n_popResponseQueue, "n", desc="Pop response queue") {
+    responseToCache_in.dequeue();
+  }
+
+  action(ll_L2toL1Transfer, "ll", desc="Schedule the L2 to L1 transfer trigger") {
+    enqueue(triggerQueue_out, TriggerMsg, l2_cache_hit_latency) {
+      out_msg.Addr := address;
+      out_msg.Type := TriggerType:L2_to_L1;
+    }
+  }
+
+  action(o_checkForCompletion, "o", desc="Check if we have received all the messages required for completion") {
+    assert(is_valid(tbe));
+    if (tbe.NumPendingMsgs == 0) {
+      enqueue(triggerQueue_out, TriggerMsg) {
+        out_msg.Addr := address;
+        if (tbe.Sharers) {
+          out_msg.Type := TriggerType:ALL_ACKS;
+        } else {
+          out_msg.Type := TriggerType:ALL_ACKS_NO_SHARERS;
+        }
+      }
+    }
+  }
+
+  action(p_decrementNumberOfMessagesByOne, "p", desc="Decrement the number of messages for which we're waiting by one") {
+    assert(is_valid(tbe));
+    tbe.NumPendingMsgs := tbe.NumPendingMsgs - 1;
+  }
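+
+  // NOTE: p_/pp_ adjust the expected-response count by one when the usual
+  // n-message estimate is off: a Store from O already holds the owner's
+  // data, so one response fewer is expected (p_...ByOne in the O/OR Store
+  // transitions), while answering an Other_GETX from OM adds one expected
+  // message back (pp_...ByOne).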
+
+  action(pp_incrementNumberOfMessagesByOne, "\p", desc="Increment the number of messages for which we're waiting by one") {
+    assert(is_valid(tbe));
+    tbe.NumPendingMsgs := tbe.NumPendingMsgs + 1;
+  }
+
+  action(q_sendDataFromTBEToCache, "q", desc="Send data from TBE to cache") {
+    peek(forwardToCache_in, RequestMsg) {
+      assert(in_msg.Requestor != machineID);
+      enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) {
+        assert(is_valid(tbe));
+        out_msg.Addr := address;
+        out_msg.Type := CoherenceResponseType:DATA;
+        out_msg.Sender := machineID;
+        out_msg.Destination.add(in_msg.Requestor);
+        out_msg.OriginalDestination.add(in_msg.Requestor);
+        DPRINTF(RubySlicc, "%s\n", out_msg.Destination);
+        out_msg.DataBlk := tbe.DataBlk;
+        out_msg.Dirty := tbe.Dirty;
+        if (in_msg.DirectedProbe) {
+          out_msg.Acks := machineCount(MachineType:L1Cache);
+        } else {
+          out_msg.Acks := 2;
+        }
+        out_msg.SilentAcks := in_msg.SilentAcks;
+        out_msg.MessageSize := MessageSizeType:Response_Data;
+        out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+        out_msg.ForwardRequestTime := in_msg.ForwardRequestTime;
+      }
+    }
+  }
+
+  action(sq_sendSharedDataFromTBEToCache, "sq", desc="Send shared data from TBE to cache, still the owner") {
+    peek(forwardToCache_in, RequestMsg) {
+      assert(in_msg.Requestor != machineID);
+      enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) {
+        assert(is_valid(tbe));
+        out_msg.Addr := address;
+        out_msg.Type := CoherenceResponseType:DATA_SHARED;
+        out_msg.Sender := machineID;
+        out_msg.Destination.add(in_msg.Requestor);
+        out_msg.OriginalDestination.add(in_msg.Requestor);
+        DPRINTF(RubySlicc, "%s\n", out_msg.Destination);
+        out_msg.DataBlk := tbe.DataBlk;
+        out_msg.Dirty := tbe.Dirty;
+        if (in_msg.DirectedProbe) {
+          out_msg.Acks := machineCount(MachineType:L1Cache);
+        } else {
+          out_msg.Acks := 2;
+        }
+        out_msg.SilentAcks := in_msg.SilentAcks;
+        out_msg.MessageSize := MessageSizeType:Response_Data;
+        out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+        out_msg.ForwardRequestTime := in_msg.ForwardRequestTime;
+      }
+    }
+  }
+
+  action(qm_sendDataFromTBEToCache, "qm", desc="Send data from TBE to cache, multiple sharers, still the owner") {
+    peek(forwardToCache_in, RequestMsg) {
+      enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) {
+        assert(is_valid(tbe));
+        out_msg.Addr := address;
+        out_msg.Type := CoherenceResponseType:DATA_SHARED;
+        out_msg.Sender := machineID;
+        out_msg.Destination := in_msg.MergedRequestors;
+        out_msg.OriginalDestination := in_msg.MergedRequestors;
+        DPRINTF(RubySlicc, "%s\n", out_msg.Destination);
+        out_msg.DataBlk := tbe.DataBlk;
+        out_msg.Dirty := tbe.Dirty;
+        out_msg.Acks := machineCount(MachineType:L1Cache);
+        out_msg.SilentAcks := in_msg.SilentAcks;
+        out_msg.MessageSize := MessageSizeType:Response_Data;
+        out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+        out_msg.ForwardRequestTime := in_msg.ForwardRequestTime;
+      }
+    }
+  }
+
+  action(qq_sendDataFromTBEToMemory, "\q", desc="Send data from TBE to memory") {
+    enqueue(unblockNetwork_out, ResponseMsg, cache_response_latency) {
+      assert(is_valid(tbe));
+      out_msg.Addr := address;
+      out_msg.Sender := machineID;
+      if (is_gpu) {
+        out_msg.Destination.broadcast(MachineType:BorderControlUnit);
+      } else {
+        out_msg.Destination.add(map_Address_to_Directory(address));
+      }
+      out_msg.OriginalDestination.add(map_Address_to_Directory(address));
+      out_msg.Dirty := tbe.Dirty;
+      if (tbe.Dirty) {
+        out_msg.Type := CoherenceResponseType:WB_DIRTY;
+        out_msg.DataBlk := tbe.DataBlk;
+        out_msg.MessageSize := MessageSizeType:Writeback_Data;
+      } else {
+        out_msg.Type := CoherenceResponseType:WB_CLEAN;
+        // NOTE: in a real system this would not send data. We send
+        // data here only so we can check it at the memory
+        out_msg.DataBlk := tbe.DataBlk;
+        out_msg.MessageSize := MessageSizeType:Writeback_Control;
+      }
+    }
+  }
+
+  action(r_setSharerBit, "r", desc="We saw other sharers") {
+    assert(is_valid(tbe));
+    tbe.Sharers := true;
+  }
+
+  action(s_deallocateTBE, "s", desc="Deallocate TBE") {
+    TBEs.deallocate(address);
+    unset_tbe();
+  }
+
+  action(t_sendExclusiveDataFromTBEToMemory, "t", desc="Send exclusive data from TBE to memory") {
+    enqueue(unblockNetwork_out, ResponseMsg, cache_response_latency) {
+      assert(is_valid(tbe));
+      out_msg.Addr := address;
+      out_msg.Sender := machineID;
+      out_msg.Destination.add(map_Address_to_Directory(address));
+      out_msg.DataBlk := tbe.DataBlk;
+      out_msg.Dirty := tbe.Dirty;
+      if (tbe.Dirty) {
+        out_msg.Type := CoherenceResponseType:WB_EXCLUSIVE_DIRTY;
+        out_msg.DataBlk := tbe.DataBlk;
+        out_msg.MessageSize := MessageSizeType:Writeback_Data;
+      } else {
+        out_msg.Type := CoherenceResponseType:WB_EXCLUSIVE_CLEAN;
+        // NOTE: in a real system this would not send data. We send
+        // data here only so we can check it at the memory
+        out_msg.DataBlk := tbe.DataBlk;
+        out_msg.MessageSize := MessageSizeType:Writeback_Control;
+      }
+    }
+  }
+
+  action(u_writeDataToCache, "u", desc="Write data to cache") {
+    peek(responseToCache_in, ResponseMsg) {
+      assert(is_valid(cache_entry));
+      cache_entry.DataBlk := in_msg.DataBlk;
+      cache_entry.Dirty := in_msg.Dirty;
+    }
+  }
+
+  action(uf_writeDataToCacheTBE, "uf", desc="Write data to TBE") {
+    peek(responseToCache_in, ResponseMsg) {
+      assert(is_valid(tbe));
+      tbe.DataBlk := in_msg.DataBlk;
+      tbe.Dirty := in_msg.Dirty;
+    }
+  }
+
+  action(v_writeDataToCacheVerify, "v", desc="Write data to cache, assert it was same as before") {
+    peek(responseToCache_in, ResponseMsg) {
+      assert(is_valid(cache_entry));
+      DPRINTF(RubySlicc, "Cached Data Block: %s, Msg Data Block: %s\n",
+              cache_entry.DataBlk, in_msg.DataBlk);
+      assert(cache_entry.DataBlk == in_msg.DataBlk);
+      cache_entry.DataBlk := in_msg.DataBlk;
+      cache_entry.Dirty := in_msg.Dirty || cache_entry.Dirty;
+    }
+  }
+
+  action(vt_writeDataToTBEVerify, "vt", desc="Write data to TBE, assert it was same as before") {
+    peek(responseToCache_in, ResponseMsg) {
+      assert(is_valid(tbe));
+      DPRINTF(RubySlicc, "Cached Data Block: %s, Msg Data Block: %s\n",
+              tbe.DataBlk, in_msg.DataBlk);
+      assert(tbe.DataBlk == in_msg.DataBlk);
+      tbe.DataBlk := in_msg.DataBlk;
+      tbe.Dirty := in_msg.Dirty || tbe.Dirty;
+    }
+  }
+
+  action(gg_deallocateL1CacheBlock, "\g", desc="Deallocate cache block. Sets the cache to invalid, allowing a replacement in parallel with a fetch.") {
+    if (L1Dcache.isTagPresent(address)) {
+      L1Dcache.deallocate(address);
+    } else {
+      L1Icache.deallocate(address);
+    }
+    unset_cache_entry();
+  }
+
+  action(ii_allocateL1DCacheBlock, "\i", desc="Set L1 D-cache tag equal to tag of block B.") {
+    if (is_invalid(cache_entry)) {
+      set_cache_entry(L1Dcache.allocate(address, new Entry));
+    }
+  }
+
+  action(jj_allocateL1ICacheBlock, "\j", desc="Set L1 I-cache tag equal to tag of block B.") {
+    if (is_invalid(cache_entry)) {
+      set_cache_entry(L1Icache.allocate(address, new Entry));
+    }
+  }
+
+  action(vv_allocateL2CacheBlock, "\v", desc="Set L2 cache tag equal to tag of block B.") {
+    set_cache_entry(L2cache.allocate(address, new Entry));
+  }
+
+  action(rr_deallocateL2CacheBlock, "\r", desc="Deallocate L2 cache block. Sets the cache to not present, allowing a replacement in parallel with a fetch.") {
+    L2cache.deallocate(address);
+    unset_cache_entry();
+  }
+
+  action(forward_eviction_to_cpu, "\cc", desc="sends eviction information to the processor") {
+    if (send_evictions) {
+      DPRINTF(RubySlicc, "Sending invalidation for %s to the CPU\n", address);
+      sequencer.evictionCallback(address);
+    }
+  }
+
+  action(uu_profileL1DataMiss, "\udm", desc="Profile the demand miss") {
+    ++L1Dcache.demand_misses;
+  }
+
+  action(uu_profileL1DataHit, "\udh", desc="Profile the demand hits") {
+    ++L1Dcache.demand_hits;
+  }
+
+  action(uu_profileL1InstMiss, "\uim", desc="Profile the demand miss") {
+    ++L1Icache.demand_misses;
+  }
+
+  action(uu_profileL1InstHit, "\uih", desc="Profile the demand hits") {
+    ++L1Icache.demand_hits;
+  }
+
+  action(uu_profileL2Miss, "\um", desc="Profile the demand miss") {
+    ++L2cache.demand_misses;
+  }
+
+  action(uu_profileL2Hit, "\uh", desc="Profile the demand hits") {
+    ++L2cache.demand_hits;
+  }
+
+  action(zz_stallAndWaitMandatoryQueue, "\z", desc="Send the head of the mandatory queue to the back of the queue.") {
+    stall_and_wait(mandatoryQueue_in, address);
+  }
+
+  action(z_stall, "z", desc="stall") {
+    // do nothing and the special z_stall action will return a protocol stall
+    // so that the next port is checked
+  }
+
+  action(kd_wakeUpDependents, "kd", desc="wake-up dependents") {
+    wakeUpBuffers(address);
+  }
+
+  action(ka_wakeUpAllDependents, "ka", desc="wake-up all dependents") {
+    wakeUpAllBuffers();
+  }
+
+  //*****************************************************
+  // TRANSITIONS
+  //*****************************************************
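+
+  // NOTE: each transition reads (current state, event[, next state])
+  // followed by its ordered action list; transitions with no explicit next
+  // state stay put. {IT, ST, OT, MT, MMT} are the transient L2-to-L1
+  // transfer states and {IR, SR, OR, MR, MMR} their transfer-complete
+  // counterparts.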
+
+  // Transitions for Load/Store/L2_Replacement from transient states
+  transition({IM, IM_F, MM_WF, SM, SM_F, ISM, ISM_F, OM, OM_F, IS, SS, OI, MI, II, IT, ST, OT, MT, MMT}, {Store, L2_Replacement}) {
+    zz_stallAndWaitMandatoryQueue;
+  }
+
+  transition({IM, IM_F, MM_WF, SM, SM_F, ISM, ISM_F, OM, OM_F, IS, SS, OI, MI, II}, {Flush_line}) {
+    zz_stallAndWaitMandatoryQueue;
+  }
+
+  transition({M_W, MM_W}, {L2_Replacement, Flush_line}) {
+    zz_stallAndWaitMandatoryQueue;
+  }
+
+  transition({IM, IS, OI, MI, II, IT, ST, OT, MT, MMT, MI_F, MM_F, OM_F, IM_F, ISM_F, SM_F, MM_WF}, {Load, Ifetch}) {
+    zz_stallAndWaitMandatoryQueue;
+  }
+
+  transition({IM, SM, ISM, OM, IS, SS, MM_W, M_W, OI, MI, II, IT, ST, OT, MT, MMT, IM_F, SM_F, ISM_F, OM_F, MM_WF, MI_F, MM_F, IR, SR, OR, MR, MMR}, L1_to_L2) {
+    zz_stallAndWaitMandatoryQueue;
+  }
+
+  transition({MI_F, MM_F}, {Store}) {
+    zz_stallAndWaitMandatoryQueue;
+  }
+
+  transition({MM_F, MI_F}, {Flush_line}) {
+    zz_stallAndWaitMandatoryQueue;
+  }
+
+  transition({IT, ST, OT, MT, MMT}, {Other_GETX, NC_DMA_GETS, Other_GETS, Merged_GETS, Other_GETS_No_Mig, Invalidate, Flush_line}) {
+    z_stall;
+  }
+
+  transition({IR, SR, OR, MR, MMR}, {Other_GETX, NC_DMA_GETS, Other_GETS, Merged_GETS, Other_GETS_No_Mig, Invalidate}) {
+    z_stall;
+  }
+
+  // Transitions moving data between the L1 and L2 caches
+  transition({I, S, O, M, MM}, L1_to_L2) {
+    i_allocateTBE;
+    gg_deallocateL1CacheBlock;
+    vv_allocateL2CacheBlock;
+    hp_copyFromTBEToL2;
+    s_deallocateTBE;
+  }
+
+  transition(I, Trigger_L2_to_L1D, IT) {
+    i_allocateTBE;
+    rr_deallocateL2CacheBlock;
+    ii_allocateL1DCacheBlock;
+    nb_copyFromTBEToL1; // Not really needed for state I
+    s_deallocateTBE;
+    zz_stallAndWaitMandatoryQueue;
+    ll_L2toL1Transfer;
+  }
+
+  transition(S, Trigger_L2_to_L1D, ST) {
+    i_allocateTBE;
+    rr_deallocateL2CacheBlock;
+    ii_allocateL1DCacheBlock;
+    nb_copyFromTBEToL1;
+    s_deallocateTBE;
+    zz_stallAndWaitMandatoryQueue;
+    ll_L2toL1Transfer;
+  }
+
+  transition(O, Trigger_L2_to_L1D, OT) {
+    i_allocateTBE;
+    rr_deallocateL2CacheBlock;
+    ii_allocateL1DCacheBlock;
+    nb_copyFromTBEToL1;
+    s_deallocateTBE;
+    zz_stallAndWaitMandatoryQueue;
+    ll_L2toL1Transfer;
+  }
+
+  transition(M, Trigger_L2_to_L1D, MT) {
+    i_allocateTBE;
+    rr_deallocateL2CacheBlock;
+    ii_allocateL1DCacheBlock;
+    nb_copyFromTBEToL1;
+    s_deallocateTBE;
+    zz_stallAndWaitMandatoryQueue;
+    ll_L2toL1Transfer;
+  }
+
+  transition(MM, Trigger_L2_to_L1D, MMT) {
+    i_allocateTBE;
+    rr_deallocateL2CacheBlock;
+    ii_allocateL1DCacheBlock;
+    nb_copyFromTBEToL1;
+    s_deallocateTBE;
+    zz_stallAndWaitMandatoryQueue;
+    ll_L2toL1Transfer;
+  }
+
+  transition(I, Trigger_L2_to_L1I, IT) {
+    i_allocateTBE;
+    rr_deallocateL2CacheBlock;
+    jj_allocateL1ICacheBlock;
+    nb_copyFromTBEToL1;
+    s_deallocateTBE;
+    zz_stallAndWaitMandatoryQueue;
+    ll_L2toL1Transfer;
+  }
+
+  transition(S, Trigger_L2_to_L1I, ST) {
+    i_allocateTBE;
+    rr_deallocateL2CacheBlock;
+    jj_allocateL1ICacheBlock;
+    nb_copyFromTBEToL1;
+    s_deallocateTBE;
+    zz_stallAndWaitMandatoryQueue;
+    ll_L2toL1Transfer;
+  }
+
+  transition(O, Trigger_L2_to_L1I, OT) {
+    i_allocateTBE;
+    rr_deallocateL2CacheBlock;
+    jj_allocateL1ICacheBlock;
+    nb_copyFromTBEToL1;
+    s_deallocateTBE;
+    zz_stallAndWaitMandatoryQueue;
+    ll_L2toL1Transfer;
+  }
+
+  transition(M, Trigger_L2_to_L1I, MT) {
+    i_allocateTBE;
+    rr_deallocateL2CacheBlock;
+    jj_allocateL1ICacheBlock;
+    nb_copyFromTBEToL1;
+    s_deallocateTBE;
+    zz_stallAndWaitMandatoryQueue;
+    ll_L2toL1Transfer;
+  }
+
+  transition(MM, Trigger_L2_to_L1I, MMT) {
+    i_allocateTBE;
+    rr_deallocateL2CacheBlock;
+    jj_allocateL1ICacheBlock;
+    nb_copyFromTBEToL1;
+    s_deallocateTBE;
+    zz_stallAndWaitMandatoryQueue;
+    ll_L2toL1Transfer;
+  }
+
+  transition(IT, Complete_L2_to_L1, IR) {
+    j_popTriggerQueue;
+    kd_wakeUpDependents;
+  }
+
+  transition(ST, Complete_L2_to_L1, SR) {
+    j_popTriggerQueue;
+    kd_wakeUpDependents;
+  }
+
+  transition(OT, Complete_L2_to_L1, OR) {
+    j_popTriggerQueue;
+    kd_wakeUpDependents;
+  }
+
+  transition(MT, Complete_L2_to_L1, MR) {
+    j_popTriggerQueue;
+    kd_wakeUpDependents;
+  }
+
+  transition(MMT, Complete_L2_to_L1, MMR) {
+    j_popTriggerQueue;
+    kd_wakeUpDependents;
+  }
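+
+  // NOTE: the ten Trigger_L2_to_L1{D,I} transitions above share one shape:
+  // stage the block in a TBE, swap the L2 entry for a fresh L1 entry,
+  // requeue the pending request (zz_stallAndWaitMandatoryQueue), and
+  // schedule Complete_L2_to_L1 after l2_cache_hit_latency; the X -> XT -> XR
+  // chain preserves the stable state across the move.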
+
+  // Transitions from Idle
+  transition({I,IR}, Load, IS) {
+    ii_allocateL1DCacheBlock;
+    i_allocateTBE;
+    a_issueGETS;
+    uu_profileL1DataMiss;
+    uu_profileL2Miss;
+    k_popMandatoryQueue;
+  }
+
+  transition({I,IR}, Ifetch, IS) {
+    jj_allocateL1ICacheBlock;
+    i_allocateTBE;
+    a_issueGETS;
+    uu_profileL1InstMiss;
+    uu_profileL2Miss;
+    k_popMandatoryQueue;
+  }
+
+  transition({I,IR}, Store, IM) {
+    ii_allocateL1DCacheBlock;
+    i_allocateTBE;
+    b_issueGETX;
+    uu_profileL1DataMiss;
+    uu_profileL2Miss;
+    k_popMandatoryQueue;
+  }
+
+  transition({I, IR}, Flush_line, IM_F) {
+    it_allocateTBE;
+    bf_issueGETF;
+    k_popMandatoryQueue;
+  }
+
+  transition(I, L2_Replacement) {
+    rr_deallocateL2CacheBlock;
+    ka_wakeUpAllDependents;
+  }
+
+  transition(I, {Other_GETX, NC_DMA_GETS, Other_GETS, Other_GETS_No_Mig, Invalidate}) {
+    f_sendAck;
+    l_popForwardQueue;
+  }
+
+  // Transitions from Shared
+  transition({S, SM, ISM}, Load) {
+    h_load_hit;
+    uu_profileL1DataHit;
+    k_popMandatoryQueue;
+  }
+
+  transition({S, SM, ISM}, Ifetch) {
+    h_load_hit;
+    uu_profileL1InstHit;
+    k_popMandatoryQueue;
+  }
+
+  transition(SR, Load, S) {
+    h_load_hit;
+    uu_profileL1DataMiss;
+    uu_profileL2Hit;
+    k_popMandatoryQueue;
+    ka_wakeUpAllDependents;
+  }
+
+  transition(SR, Ifetch, S) {
+    h_load_hit;
+    uu_profileL1InstMiss;
+    uu_profileL2Hit;
+    k_popMandatoryQueue;
+    ka_wakeUpAllDependents;
+  }
+
+  transition({S,SR}, Store, SM) {
+    i_allocateTBE;
+    b_issueGETX;
+    uu_profileL1DataMiss;
+    uu_profileL2Miss;
+    k_popMandatoryQueue;
+  }
+
+  transition({S, SR}, Flush_line, SM_F) {
+    i_allocateTBE;
+    bf_issueGETF;
+    forward_eviction_to_cpu;
+    gg_deallocateL1CacheBlock;
+    k_popMandatoryQueue;
+  }
+
+  transition(S, L2_Replacement, I) {
+    forward_eviction_to_cpu;
+    rr_deallocateL2CacheBlock;
+    ka_wakeUpAllDependents;
+  }
+
+  transition(S, {Other_GETX, Invalidate}, I) {
+    f_sendAck;
+    forward_eviction_to_cpu;
+    l_popForwardQueue;
+  }
+
+  transition(S, {NC_DMA_GETS, Other_GETS, Other_GETS_No_Mig}) {
+    ff_sendAckShared;
+    l_popForwardQueue;
+  }
+
+  // Transitions from Owned
+  transition({O, OM, SS, MM_W, M_W}, {Load}) {
+    h_load_hit;
+    uu_profileL1DataHit;
+    k_popMandatoryQueue;
+  }
+
+  transition({O, OM, SS, MM_W, M_W}, {Ifetch}) {
+    h_load_hit;
+    uu_profileL1InstHit;
+    k_popMandatoryQueue;
+  }
+
+  transition(OR, Load, O) {
+    h_load_hit;
+    uu_profileL1DataMiss;
+    uu_profileL2Hit;
+    k_popMandatoryQueue;
+    ka_wakeUpAllDependents;
+  }
+
+  transition(OR, Ifetch, O) {
+    h_load_hit;
+    uu_profileL1InstMiss;
+    uu_profileL2Hit;
+    k_popMandatoryQueue;
+    ka_wakeUpAllDependents;
+  }
+
+  transition({O,OR}, Store, OM) {
+    i_allocateTBE;
+    b_issueGETX;
+    p_decrementNumberOfMessagesByOne;
+    uu_profileL1DataMiss;
+    uu_profileL2Miss;
+    k_popMandatoryQueue;
+  }
+
+  transition({O, OR}, Flush_line, OM_F) {
+    i_allocateTBE;
+    bf_issueGETF;
+    p_decrementNumberOfMessagesByOne;
+    forward_eviction_to_cpu;
+    gg_deallocateL1CacheBlock;
+    k_popMandatoryQueue;
+  }
+
+  transition(O, L2_Replacement, OI) {
+    i_allocateTBE;
+    d_issuePUT;
+    forward_eviction_to_cpu;
+    rr_deallocateL2CacheBlock;
+    ka_wakeUpAllDependents;
+  }
+
+  transition(O, {Other_GETX, Invalidate}, I) {
+    e_sendData;
+    forward_eviction_to_cpu;
+    l_popForwardQueue;
+  }
+
+  transition(O, {NC_DMA_GETS, Other_GETS, Other_GETS_No_Mig}) {
+    ee_sendDataShared;
+    l_popForwardQueue;
+  }
+
+  transition(O, Merged_GETS) {
+    em_sendDataSharedMultiple;
+    l_popForwardQueue;
+  }
+
+  // Transitions from Modified
+  transition({MM, M}, {Ifetch}) {
+    h_load_hit;
+    uu_profileL1InstHit;
+    k_popMandatoryQueue;
+  }
+
+  transition({MM, M}, {Load}) {
+    h_load_hit;
+    uu_profileL1DataHit;
+    k_popMandatoryQueue;
+  }
+
+  transition(MM, Store) {
+    hh_store_hit;
+    uu_profileL1DataHit;
+    k_popMandatoryQueue;
+  }
+
+  transition(MMR, Load, MM) {
+    h_load_hit;
+    uu_profileL1DataMiss;
+    uu_profileL2Hit;
+    k_popMandatoryQueue;
+    ka_wakeUpAllDependents;
+  }
+
+  transition(MMR, Ifetch, MM) {
+    h_load_hit;
+    uu_profileL1InstMiss;
+    uu_profileL2Hit;
+    k_popMandatoryQueue;
+    ka_wakeUpAllDependents;
+  }
+
+  transition(MMR, Store, MM) {
+    hh_store_hit;
+    uu_profileL1DataMiss;
+    uu_profileL2Hit;
+    k_popMandatoryQueue;
+    ka_wakeUpAllDependents;
+  }
+
+  transition({MM, M, MMR, MR}, Flush_line, MM_F) {
+    i_allocateTBE;
+    bf_issueGETF;
+    p_decrementNumberOfMessagesByOne;
+    forward_eviction_to_cpu;
+    gg_deallocateL1CacheBlock;
+    k_popMandatoryQueue;
+  }
+
+  transition(MM_F, Block_Ack, MI_F) {
+    df_issuePUTF;
+    l_popForwardQueue;
+    kd_wakeUpDependents;
+  }
+
+  transition(MM, L2_Replacement, MI) {
+    i_allocateTBE;
+    d_issuePUT;
+    forward_eviction_to_cpu;
+    rr_deallocateL2CacheBlock;
+    ka_wakeUpAllDependents;
+  }
+
+  transition(MM, {Other_GETX, Invalidate}, I) {
+    c_sendExclusiveData;
+    forward_eviction_to_cpu;
+    l_popForwardQueue;
+  }
+
+  transition(MM, Other_GETS, I) {
+    c_sendExclusiveData;
+    forward_eviction_to_cpu;
+    l_popForwardQueue;
+  }
+
+  transition(MM, NC_DMA_GETS, O) {
+    ee_sendDataShared;
+    l_popForwardQueue;
+  }
+
+  transition(MM, Other_GETS_No_Mig, O) {
+    ee_sendDataShared;
+    l_popForwardQueue;
+  }
+
+  transition(MM, Merged_GETS, O) {
+    em_sendDataSharedMultiple;
+    l_popForwardQueue;
+  }
+
+  // Transitions from Dirty Exclusive
+  transition(M, Store, MM) {
+    hh_store_hit;
+    uu_profileL1DataHit;
+    k_popMandatoryQueue;
+  }
+
+  transition(MR, Load, M) {
+    h_load_hit;
+    uu_profileL1DataMiss;
+    uu_profileL2Hit;
+    k_popMandatoryQueue;
+    ka_wakeUpAllDependents;
+  }
+
+  transition(MR, Ifetch, M) {
+    h_load_hit;
+    uu_profileL1InstMiss;
+    uu_profileL2Hit;
+    k_popMandatoryQueue;
+    ka_wakeUpAllDependents;
+  }
+
+  transition(MR, Store, MM) {
+    hh_store_hit;
+    uu_profileL1DataMiss;
+    uu_profileL2Hit;
+    k_popMandatoryQueue;
+    ka_wakeUpAllDependents;
+  }
+
+  transition(M, L2_Replacement, MI) {
+    i_allocateTBE;
+    d_issuePUT;
+    forward_eviction_to_cpu;
+    rr_deallocateL2CacheBlock;
+    ka_wakeUpAllDependents;
+  }
+
+  transition(M, {Other_GETX, Invalidate}, I) {
+    c_sendExclusiveData;
+    forward_eviction_to_cpu;
+    l_popForwardQueue;
+  }
+
+  transition(M, {Other_GETS, Other_GETS_No_Mig}, O) {
+    ee_sendDataShared;
+    l_popForwardQueue;
+  }
+
+  transition(M, NC_DMA_GETS, O) {
+    ee_sendDataShared;
+    l_popForwardQueue;
+  }
+
+  transition(M, Merged_GETS, O) {
+    em_sendDataSharedMultiple;
+    l_popForwardQueue;
+  }
+
+  // Transitions from IM
+
+  transition({IM, IM_F}, {Other_GETX, NC_DMA_GETS, Other_GETS, Other_GETS_No_Mig, Invalidate}) {
+    f_sendAck;
+    l_popForwardQueue;
+  }
+
+  transition({IM, IM_F, MM_F}, Ack) {
+    m_decrementNumberOfMessages;
+    o_checkForCompletion;
+    n_popResponseQueue;
+  }
+
+  transition(IM, Data, ISM) {
+    u_writeDataToCache;
+    m_decrementNumberOfMessages;
+    o_checkForCompletion;
+    n_popResponseQueue;
+  }
+
+  transition(IM_F, Data, ISM_F) {
+    uf_writeDataToCacheTBE;
+    m_decrementNumberOfMessages;
+    o_checkForCompletion;
+    n_popResponseQueue;
+  }
+
+  transition(IM, Exclusive_Data, MM_W) {
+    u_writeDataToCache;
+    m_decrementNumberOfMessages;
+    o_checkForCompletion;
+    sx_external_store_hit;
+    n_popResponseQueue;
+    kd_wakeUpDependents;
+  }
+
+  transition(IM_F, Exclusive_Data, MM_WF) {
+    uf_writeDataToCacheTBE;
+    m_decrementNumberOfMessages;
+    o_checkForCompletion;
+    n_popResponseQueue;
+  }
+
+  // Transitions from SM
+  transition({SM, SM_F}, {NC_DMA_GETS, Other_GETS, Other_GETS_No_Mig}) {
+    ff_sendAckShared;
+    l_popForwardQueue;
+  }
+
+  transition(SM, {Other_GETX, Invalidate}, IM) {
+    f_sendAck;
+    forward_eviction_to_cpu;
+    l_popForwardQueue;
+  }
+
+  transition(SM_F, {Other_GETX, Invalidate}, IM_F) {
+    f_sendAck;
+    forward_eviction_to_cpu;
+    l_popForwardQueue;
+  }
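+
+  // NOTE: in SM the line is still valid in S, so arriving data must match
+  // the locally cached copy; the verify actions (v_/vt_) assert DataBlk
+  // equality rather than blindly overwriting it.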
+
+  transition({SM, SM_F}, Ack) {
+    m_decrementNumberOfMessages;
+    o_checkForCompletion;
+    n_popResponseQueue;
+  }
+
+  transition(SM, {Data, Exclusive_Data}, ISM) {
+    v_writeDataToCacheVerify;
+    m_decrementNumberOfMessages;
+    o_checkForCompletion;
+    n_popResponseQueue;
+  }
+
+  transition(SM_F, {Data, Exclusive_Data}, ISM_F) {
+    vt_writeDataToTBEVerify;
+    m_decrementNumberOfMessages;
+    o_checkForCompletion;
+    n_popResponseQueue;
+  }
+
+  // Transitions from ISM
+  transition({ISM, ISM_F}, Ack) {
+    m_decrementNumberOfMessages;
+    o_checkForCompletion;
+    n_popResponseQueue;
+  }
+
+  transition(ISM, All_acks_no_sharers, MM) {
+    sxt_trig_ext_store_hit;
+    gm_sendUnblockM;
+    s_deallocateTBE;
+    j_popTriggerQueue;
+    kd_wakeUpDependents;
+  }
+
+  transition(ISM_F, All_acks_no_sharers, MI_F) {
+    df_issuePUTF;
+    j_popTriggerQueue;
+    kd_wakeUpDependents;
+  }
+
+  // Transitions from OM
+
+  transition(OM, {Other_GETX, Invalidate}, IM) {
+    e_sendData;
+    pp_incrementNumberOfMessagesByOne;
+    forward_eviction_to_cpu;
+    l_popForwardQueue;
+  }
+
+  transition(OM_F, {Other_GETX, Invalidate}, IM_F) {
+    q_sendDataFromTBEToCache;
+    pp_incrementNumberOfMessagesByOne;
+    forward_eviction_to_cpu;
+    l_popForwardQueue;
+  }
+
+  transition(OM, {NC_DMA_GETS, Other_GETS, Other_GETS_No_Mig}) {
+    ee_sendDataShared;
+    l_popForwardQueue;
+  }
+
+  transition(OM, Merged_GETS) {
+    em_sendDataSharedMultiple;
+    l_popForwardQueue;
+  }
+
+  transition(OM_F, {NC_DMA_GETS, Other_GETS, Other_GETS_No_Mig}) {
+    et_sendDataSharedFromTBE;
+    l_popForwardQueue;
+  }
+
+  transition(OM_F, Merged_GETS) {
+    emt_sendDataSharedMultipleFromTBE;
+    l_popForwardQueue;
+  }
+
+  transition({OM, OM_F}, Ack) {
+    m_decrementNumberOfMessages;
+    o_checkForCompletion;
+    n_popResponseQueue;
+  }
+
+  transition(OM, {All_acks, All_acks_no_sharers}, MM) {
+    sxt_trig_ext_store_hit;
+    gm_sendUnblockM;
+    s_deallocateTBE;
+    j_popTriggerQueue;
+    kd_wakeUpDependents;
+  }
+
+  transition({MM_F, OM_F}, {All_acks, All_acks_no_sharers}, MI_F) {
+    df_issuePUTF;
+    j_popTriggerQueue;
+    kd_wakeUpDependents;
+  }
+
+  // Transitions from IS
+
+  transition(IS, {Other_GETX, NC_DMA_GETS, Other_GETS, Other_GETS_No_Mig, Invalidate}) {
+    f_sendAck;
+    l_popForwardQueue;
+  }
+
+  transition(IS, Ack) {
+    m_decrementNumberOfMessages;
+    o_checkForCompletion;
+    n_popResponseQueue;
+  }
+
+  transition(IS, Shared_Ack) {
+    m_decrementNumberOfMessages;
+    r_setSharerBit;
+    o_checkForCompletion;
+    n_popResponseQueue;
+  }
+
+  transition(IS, Data, SS) {
+    u_writeDataToCache;
+    m_decrementNumberOfMessages;
+    o_checkForCompletion;
+    hx_external_load_hit;
+    uo_updateCurrentOwner;
+    n_popResponseQueue;
+    kd_wakeUpDependents;
+  }
+
+  transition(IS, Exclusive_Data, M_W) {
+    u_writeDataToCache;
+    m_decrementNumberOfMessages;
+    o_checkForCompletion;
+    hx_external_load_hit;
+    n_popResponseQueue;
+    kd_wakeUpDependents;
+  }
+
+  transition(IS, Shared_Data, SS) {
+    u_writeDataToCache;
+    r_setSharerBit;
+    m_decrementNumberOfMessages;
+    o_checkForCompletion;
+    hx_external_load_hit;
+    uo_updateCurrentOwner;
+    n_popResponseQueue;
+    kd_wakeUpDependents;
+  }
+
+  // Transitions from SS
+
+  transition(SS, Ack) {
+    m_decrementNumberOfMessages;
+    o_checkForCompletion;
+    n_popResponseQueue;
+  }
+
+  transition(SS, Shared_Ack) {
+    m_decrementNumberOfMessages;
+    r_setSharerBit;
+    o_checkForCompletion;
+    n_popResponseQueue;
+  }
+
+  transition(SS, All_acks, S) {
+    gs_sendUnblockS;
+    s_deallocateTBE;
+    j_popTriggerQueue;
+    kd_wakeUpDependents;
+  }
+
+  transition(SS, All_acks_no_sharers, S) {
+    // Note: The directory might still be the owner, so that is why we go to S
+    gs_sendUnblockS;
+    s_deallocateTBE;
+    j_popTriggerQueue;
+    kd_wakeUpDependents;
+  }
+
+  // Transitions from MM_W
+
+  transition(MM_W, Store) {
+    hh_store_hit;
+    uu_profileL1DataHit;
+    k_popMandatoryQueue;
+  }
+
+  transition({MM_W, MM_WF}, Ack) {
+    m_decrementNumberOfMessages;
+    o_checkForCompletion;
+    n_popResponseQueue;
+  }
+
+  transition(MM_W, All_acks_no_sharers, MM) {
+    gm_sendUnblockM;
+    s_deallocateTBE;
+    j_popTriggerQueue;
+    kd_wakeUpDependents;
+  }
+
+  transition(MM_WF, All_acks_no_sharers, MI_F) {
+    df_issuePUTF;
+    j_popTriggerQueue;
+    kd_wakeUpDependents;
+  }
+
+  // Transitions from M_W
+
+  transition(M_W, Store, MM_W) {
+    hh_store_hit;
+    uu_profileL1DataHit;
+    k_popMandatoryQueue;
+  }
+
+  transition(M_W, Ack) {
+    m_decrementNumberOfMessages;
+    o_checkForCompletion;
+    n_popResponseQueue;
+  }
+
+  transition(M_W, All_acks_no_sharers, M) {
+    gm_sendUnblockM;
+    s_deallocateTBE;
+    j_popTriggerQueue;
+    kd_wakeUpDependents;
+  }
+
+  // Transitions from OI/MI
+
+  transition({OI, MI}, {Other_GETX, Invalidate}, II) {
+    q_sendDataFromTBEToCache;
+    l_popForwardQueue;
+  }
+
+  transition({OI, MI}, {NC_DMA_GETS, Other_GETS, Other_GETS_No_Mig}, OI) {
+    sq_sendSharedDataFromTBEToCache;
+    l_popForwardQueue;
+  }
+
+  transition({OI, MI}, Merged_GETS, OI) {
+    qm_sendDataFromTBEToCache;
+    l_popForwardQueue;
+  }
+
+  transition(MI, Writeback_Ack, I) {
+    t_sendExclusiveDataFromTBEToMemory;
+    s_deallocateTBE;
+    l_popForwardQueue;
+    kd_wakeUpDependents;
+  }
+
+  transition(MI_F, Writeback_Ack, I) {
+    hh_flush_hit;
+    t_sendExclusiveDataFromTBEToMemory;
+    s_deallocateTBE;
+    l_popForwardQueue;
+    kd_wakeUpDependents;
+  }
+
+  transition(OI, Writeback_Ack, I) {
+    qq_sendDataFromTBEToMemory;
+    s_deallocateTBE;
+    l_popForwardQueue;
+    kd_wakeUpDependents;
+  }
+
+  // Transitions from II
+  transition(II, {NC_DMA_GETS, Other_GETS, Other_GETS_No_Mig, Other_GETX, Invalidate}, II) {
+    f_sendAck;
+    l_popForwardQueue;
+  }
+
+  transition(II, Writeback_Ack, I) {
+    g_sendUnblock;
+    s_deallocateTBE;
+    l_popForwardQueue;
+    kd_wakeUpDependents;
+  }
+
+  transition(II, Writeback_Nack, I) {
+    s_deallocateTBE;
+    l_popForwardQueue;
+    kd_wakeUpDependents;
+  }
+
+  transition(MM_F, {Other_GETX, Invalidate}, IM_F) {
+    ct_sendExclusiveDataFromTBE;
+    pp_incrementNumberOfMessagesByOne;
+    l_popForwardQueue;
+  }
+
+  transition(MM_F, Other_GETS, IM_F) {
+    ct_sendExclusiveDataFromTBE;
+    pp_incrementNumberOfMessagesByOne;
+    l_popForwardQueue;
+  }
+
+  transition(MM_F, NC_DMA_GETS, OM_F) {
+    sq_sendSharedDataFromTBEToCache;
+    l_popForwardQueue;
+  }
+
+  transition(MM_F, Other_GETS_No_Mig, OM_F) {
+    et_sendDataSharedFromTBE;
+    l_popForwardQueue;
+  }
+
+  transition(MM_F, Merged_GETS, OM_F) {
+    emt_sendDataSharedMultipleFromTBE;
+    l_popForwardQueue;
+  }
+}
diff -r 3ee9d80f490f -r 7b001aa001f0 src/mem/protocol/MOESI_hammer_bcu-BCU.sm
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/mem/protocol/MOESI_hammer_bcu-BCU.sm	Wed Dec 02 17:08:48 2015 -0600
@@ -0,0 +1,252 @@
+
+
+
+machine(BorderControlUnit, "Border control unit")
+: Cycles latency := 1;
+
+  // NOTE: I'm pretty sure the directory to cache side is not necessary.
+  // But we'll just never send messages to this controller over these
+  // networks.
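+  //
+  // The BCU sits on the CPU/GPU boundary: GPU-side L1s broadcast requests,
+  // responses, and unblocks to it (see the is_gpu branches in the cache
+  // machine above), and it relays each message to the directory named in
+  // OriginalDestination. It keeps no per-block state, so everything is
+  // handled in the single stable state I.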
+ + // Interface with the cache (copied from MOESI_hammer-dir.sm) + // MessageBuffer * forwardFromDirToCache, network="To", virtual_network="3", ordered="false", vnet_type="forward"; + // MessageBuffer * responseFromDirToCache, network="To", virtual_network="4", ordered="false", vnet_type="response"; + + MessageBuffer * unblockToDirFromCache, network="From", virtual_network="5", ordered="false", vnet_type="unblock"; + MessageBuffer * responseToDirFromCache, network="From", virtual_network="4", ordered="false", vnet_type="response"; + MessageBuffer * requestToDirFromCache, network="From", virtual_network="2", ordered="false", vnet_type="request", recycle_latency="1"; + + // Interface with the directory (copied from MOESI_hammer-cache.sm) + MessageBuffer * requestFromCacheToDir, network="To", virtual_network="2", ordered="false", vnet_type="request"; + MessageBuffer * responseFromCacheToDir, network="To", virtual_network="4", ordered="false", vnet_type="response"; + MessageBuffer * unblockFromCacheToDir, network="To", virtual_network="5", ordered="false", vnet_type="unblock"; + + // MessageBuffer * forwardToCacheFromDir, network="From", virtual_network="3", ordered="false", vnet_type="forward"; + // MessageBuffer * responseToCacheFromDir, network="From", virtual_network="4", ordered="false", vnet_type="response"; +{ + + state_declaration(State, desc="Cache states") { + I, AccessPermission:Invalid, desc="Idle"; + } + + enumeration(Event, desc="BCU events") { + //FwdFromDir; + RespFromCache; + UnblockFromCache; + //ResponseFromDir; + RequestFromCache; + } + + DataBlock blk; + + DataBlock getDataBlock(Address addr), return_by_ref="yes" { + return blk; + } + + AccessPermission getAccessPermission(Address addr) { + return AccessPermission:NotPresent; + } + + void setAccessPermission(Address addr, State state) { + } + + State getState(Address addr) { + return State:I; + } + + void setState(Address addr, State state) { + } + + //out_port(fwdNetwork_out, RequestMsg, forwardFromDirToCache); + //out_port(responseNetworkToCache_out, ResponseMsg, responseFromDirToCache); + out_port(requestNetwork_out, RequestMsg, requestFromCacheToDir); + out_port(responseNetworkToDir_out, ResponseMsg, responseFromCacheToDir); + out_port(unblockNetwork_out, ResponseMsg, unblockFromCacheToDir); + + // Think about if this is the right order!! + + in_port(unblock_in, ResponseMsg, unblockToDirFromCache) { + if (unblock_in.isReady()) { + peek(unblock_in, ResponseMsg) { + trigger(Event:UnblockFromCache, in_msg.Addr); + // NOTE: no need for the entry or the TBE unless we explicitly + // add those things to this controller. 
(see line 81 of + // InPortDeclAST.py) + } + } + } + + // in_port(respFromDir_in, ResponseMsg, responseToCacheFromDir) { + // if (respFromDir_in.isReady()) { + // peek(respFromDir_in, ResponseMsg) { + // trigger(Event:ResponseFromDir, in_msg.Addr); + // } + // } + // } + + in_port(respFromCache_in, ResponseMsg, responseToDirFromCache) { + if (respFromCache_in.isReady()) { + peek(respFromCache_in, ResponseMsg) { + trigger(Event:RespFromCache, in_msg.Addr); + } + } + } + + in_port(requestFromCache_in, RequestMsg, requestToDirFromCache) { + if (requestFromCache_in.isReady()) { + peek(requestFromCache_in, RequestMsg) { + trigger(Event:RequestFromCache, in_msg.Addr); + } + } + } + + // in_port(fwdFromDir_in, ResponseMsg, forwardToCacheFromDir) { + // if (fwdFromDir_in.isReady()) { + // peek(fwdFromDir_in, ResponseMsg) { + // trigger(Event:FwdFromDir, in_msg.Addr); + // } + // } + // } + + // action(ff_fwdtocache, "ff", desc="FwdFromDir") { + // peek(fwdFromDir_in, ResponseMsg) { + // enqueue(fwdNetwork_out, ResponseMsg, latency) { + // out_msg.Addr := in_msg.Addr; + // out_msg.Type := in_msg.Type; + // out_msg.Sender := in_msg.Sender; + // out_msg.CurOwner := in_msg.CurOwner; + // out_msg.Destination := in_msg.OriginalDestination; + // out_msg.DataBlk := in_msg.DataBlk; + // out_msg.Dirty := in_msg.Dirty; + // out_msg.Acks := in_msg.Acks; + // out_msg.MessageSize := in_msg.MessageSize; + // out_msg.InitialRequestTime := in_msg.InitialRequestTime; + // out_msg.ForwardRequestTime := in_msg.ForwardRequestTime; + // out_msg.SilentAcks := in_msg.SilentAcks; + // } + // } + // } + + action(rsd_resptodir, "rtd", desc="RespFromCache") { + peek(respFromCache_in, ResponseMsg) { + DPRINTF(RubySlicc, "Got req to addr %s\n", in_msg.Addr); + enqueue(responseNetworkToDir_out, ResponseMsg, latency) { + out_msg.Addr := in_msg.Addr; + out_msg.Type := in_msg.Type; + out_msg.Sender := in_msg.Sender; + out_msg.CurOwner := in_msg.CurOwner; + out_msg.Destination := in_msg.OriginalDestination; + out_msg.DataBlk := in_msg.DataBlk; + out_msg.Dirty := in_msg.Dirty; + out_msg.Acks := in_msg.Acks; + out_msg.MessageSize := in_msg.MessageSize; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := in_msg.ForwardRequestTime; + out_msg.SilentAcks := in_msg.SilentAcks; + } + } + } + + // action(rsc_resptocache, "rsc", desc="ResponseFromDir") { + // peek(respFromDir_in, ResponseMsg) { + // enqueue(responseNetworkToCache_out, ResponseMsg, latency) { + // out_msg.Addr := in_msg.Addr; + // out_msg.Type := in_msg.Type; + // out_msg.Sender := in_msg.Sender; + // out_msg.CurOwner := in_msg.CurOwner; + // out_msg.Destination := in_msg.OriginalDestination; + // out_msg.DataBlk := in_msg.DataBlk; + // out_msg.Dirty := in_msg.Dirty; + // out_msg.Acks := in_msg.Acks; + // out_msg.MessageSize := in_msg.MessageSize; + // out_msg.InitialRequestTime := in_msg.InitialRequestTime; + // out_msg.ForwardRequestTime := in_msg.ForwardRequestTime; + // out_msg.SilentAcks := in_msg.SilentAcks; + // } + // } + // } + + action(uc_unblocktodir, "uc", desc="UnblockFromCache") { + peek(unblock_in, ResponseMsg) { + DPRINTF(RubySlicc, "Got req to addr %s\n", in_msg.Addr); + enqueue(unblockNetwork_out, ResponseMsg, latency) { + out_msg.Addr := in_msg.Addr; + out_msg.Type := in_msg.Type; + out_msg.Sender := in_msg.Sender; + out_msg.CurOwner := in_msg.CurOwner; + out_msg.Destination := in_msg.OriginalDestination; + out_msg.DataBlk := in_msg.DataBlk; + out_msg.Dirty := in_msg.Dirty; + out_msg.Acks := in_msg.Acks; + 
out_msg.MessageSize := in_msg.MessageSize; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := in_msg.ForwardRequestTime; + out_msg.SilentAcks := in_msg.SilentAcks; + } + } + } + + action(rqc_reqtodir, "rqc", desc="RequestFromCache") { + peek(requestFromCache_in, RequestMsg) { + DPRINTF(RubySlicc, "Got req to addr %s\n", in_msg.Addr); + enqueue(requestNetwork_out, RequestMsg, latency) { + out_msg.Addr := in_msg.Addr; + out_msg.Type := in_msg.Type; + out_msg.Requestor := in_msg.Requestor; + out_msg.MergedRequestors := in_msg.MergedRequestors; + out_msg.Destination := in_msg.OriginalDestination; + out_msg.MessageSize := in_msg.MessageSize; + out_msg.DirectedProbe := in_msg.DirectedProbe; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := in_msg.ForwardRequestTime; + out_msg.SilentAcks := in_msg.SilentAcks; + } + } + } + + action(pu_popunblock, "pu", desc="") { + unblock_in.dequeue(); + } + + // action(prfd_poprespFromDir, "prfd", desc="") { + // respFromDir_in.dequeue(); + // } + + action(prfc_poprespFromCache, "prfc", desc="") { + respFromCache_in.dequeue(); + } + + action(pq_popreqFromCache, "pq", desc="") { + requestFromCache_in.dequeue(); + } + + // action(pf_popfwdFromDir, "pf", desc="") { + // fwdFromDir_in.dequeue(); + // } + + + // transition({I}, {FwdFromDir}) { + // ff_fwdtocache; + // pf_popfwdFromDir; + // } + + transition({I}, {RespFromCache}) { + rsd_resptodir; + prfc_poprespFromCache; + } + + transition({I}, {UnblockFromCache}) { + uc_unblocktodir; + pu_popunblock; + } + + // transition({I}, {ResponseFromDir}) { + // rsc_resptocache; + // prfd_poprespFromDir; + // } + + transition({I}, {RequestFromCache}) { + rqc_reqtodir; + pq_popreqFromCache; + } +} \ No newline at end of file diff -r 3ee9d80f490f -r 7b001aa001f0 src/mem/protocol/MOESI_hammer_bcu-msg.sm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/mem/protocol/MOESI_hammer_bcu-msg.sm Wed Dec 02 17:08:48 2015 -0600 @@ -0,0 +1,203 @@ +/* + * Copyright (c) 1999-2008 Mark D. Hill and David A. Wood + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * AMD's contributions to the MOESI hammer protocol do not constitute an + * endorsement of its similarity to any AMD products. + */ + +// CoherenceRequestType +enumeration(CoherenceRequestType, desc="...") { + GETX, desc="Get eXclusive"; + GETS, desc="Get Shared"; + MERGED_GETS, desc="Get Shared"; + PUT, desc="Put Ownership"; + WB_ACK, desc="Writeback ack"; + WB_NACK, desc="Writeback neg. ack"; + PUTF, desc="PUT on a Flush"; + GETF, desc="Issue exclusive for Flushing"; + BLOCK_ACK, desc="Dir Block ack"; + INV, desc="Invalidate"; +} + +// CoherenceResponseType +enumeration(CoherenceResponseType, desc="...") { + ACK, desc="ACKnowledgment, responder does not have a copy"; + ACK_SHARED, desc="ACKnowledgment, responder has a shared copy"; + DATA, desc="Data, responder does not have a copy"; + DATA_SHARED, desc="Data, responder has a shared copy"; + DATA_EXCLUSIVE, desc="Data, responder was exclusive, gave us a copy, and they went to invalid"; + WB_CLEAN, desc="Clean writeback"; + WB_DIRTY, desc="Dirty writeback"; + WB_EXCLUSIVE_CLEAN, desc="Clean writeback of exclusive data"; + WB_EXCLUSIVE_DIRTY, desc="Dirty writeback of exclusive data"; + UNBLOCK, desc="Unblock for writeback"; + UNBLOCKS, desc="Unblock now in S"; + UNBLOCKM, desc="Unblock now in M/O/E"; + NULL, desc="Null value"; +} + +// TriggerType +enumeration(TriggerType, desc="...") { + L2_to_L1, desc="L2 to L1 transfer"; + ALL_ACKS, desc="See corresponding event"; + ALL_ACKS_OWNER_EXISTS,desc="See corresponding event"; + ALL_ACKS_NO_SHARERS, desc="See corresponding event"; + ALL_UNBLOCKS, desc="all unblockS received"; +} + +// TriggerMsg +structure(TriggerMsg, desc="...", interface="Message") { + Address Addr, desc="Physical address for this request"; + TriggerType Type, desc="Type of trigger"; + + bool functionalRead(Packet *pkt) { + // Trigger messages do not hold any data! + return false; + } + + bool functionalWrite(Packet *pkt) { + // Trigger messages do not hold any data! 
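+ // (Triggers such as ALL_ACKS are controller-internal bookkeeping
+ // events, so functional accesses can always skip them.)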
+ return false; + } +} + +// RequestMsg (and also forwarded requests) +structure(RequestMsg, desc="...", interface="NetworkMessage") { + Address Addr, desc="Physical address for this request"; + CoherenceRequestType Type, desc="Type of request (GetS, GetX, PutX, etc)"; + MachineID Requestor, desc="Node who initiated the request"; + NetDest MergedRequestors, desc="Merge set of read requestors"; + NetDest Destination, desc="Multicast destination mask"; + NetDest OriginalDestination, desc="Multicast destination mask"; + MessageSizeType MessageSize, desc="size category of the message"; + bool DirectedProbe, default="false", desc="probe filter directed probe"; + + Cycles InitialRequestTime, default="Cycles(0)", + desc="time the initial requests was sent from the L1Cache"; + Cycles ForwardRequestTime, default="Cycles(0)", + desc="time the dir forwarded the request"; + int SilentAcks, default="0", desc="silent acks from the full-bit directory"; + + bool functionalRead(Packet *pkt) { + // Request messages do not hold any data + return false; + } + + bool functionalWrite(Packet *pkt) { + // Request messages do not hold any data + return false; + } +} + +// ResponseMsg (and also unblock requests) +structure(ResponseMsg, desc="...", interface="NetworkMessage") { + Address Addr, desc="Physical address for this request"; + CoherenceResponseType Type, desc="Type of response (Ack, Data, etc)"; + MachineID Sender, desc="Node who sent the data"; + MachineID CurOwner, desc="current owner of the block, used for UnblockS responses"; + NetDest Destination, desc="Node to whom the data is sent"; + NetDest OriginalDestination, desc="Multicast destination mask"; + DataBlock DataBlk, desc="data for the cache line"; + bool Dirty, desc="Is the data dirty (different than memory)?"; + int Acks, default="0", desc="How many messages this counts as"; + MessageSizeType MessageSize, desc="size category of the message"; + + Cycles InitialRequestTime, default="Cycles(0)", + desc="time the initial requests was sent from the L1Cache"; + Cycles ForwardRequestTime, default="Cycles(0)", + desc="time the dir forwarded the request"; + int SilentAcks, default="0", desc="silent acks from the full-bit directory"; + + bool functionalRead(Packet *pkt) { + // The check below ensures that data is read only from messages that + // actually hold data. + if (Type == CoherenceResponseType:DATA || + Type == CoherenceResponseType:DATA_SHARED || + Type == CoherenceResponseType:DATA_EXCLUSIVE || + Type == CoherenceResponseType:WB_DIRTY || + Type == CoherenceResponseType:WB_EXCLUSIVE_DIRTY) { + return testAndRead(Addr, DataBlk, pkt); + } + + return false; + } + + bool functionalWrite(Packet *pkt) { + // Message type does not matter since all messages are written. + // If a protocol reads data from a packet that is not supposed + // to hold the data, then the fault lies with the protocol. 
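+ // (Unconditionally patching DataBlk is safe because a stale block in a
+ // non-data message is never consumed; functionalRead above filters by
+ // message type.)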
+ return testAndWrite(Addr, DataBlk, pkt); + } +} + +enumeration(DMARequestType, desc="...", default="DMARequestType_NULL") { + READ, desc="Memory Read"; + WRITE, desc="Memory Write"; + NULL, desc="Invalid"; +} + +enumeration(DMAResponseType, desc="...", default="DMAResponseType_NULL") { + DATA, desc="DATA read"; + ACK, desc="ACK write"; + NULL, desc="Invalid"; +} + +structure(DMARequestMsg, desc="...", interface="NetworkMessage") { + DMARequestType Type, desc="Request type (read/write)"; + Address PhysicalAddress, desc="Physical address for this request"; + Address LineAddress, desc="Line address for this request"; + MachineID Requestor, desc="Node who initiated the request"; + NetDest Destination, desc="Destination"; + DataBlock DataBlk, desc="DataBlk attached to this request"; + int Len, desc="The length of the request"; + MessageSizeType MessageSize, desc="size category of the message"; + + bool functionalRead(Packet *pkt) { + return testAndRead(LineAddress, DataBlk, pkt); + } + + bool functionalWrite(Packet *pkt) { + return testAndWrite(LineAddress, DataBlk, pkt); + } +} + +structure(DMAResponseMsg, desc="...", interface="NetworkMessage") { + DMAResponseType Type, desc="Response type (DATA/ACK)"; + Address PhysicalAddress, desc="Physical address for this request"; + Address LineAddress, desc="Line address for this request"; + NetDest Destination, desc="Destination"; + DataBlock DataBlk, desc="DataBlk attached to this request"; + MessageSizeType MessageSize, desc="size category of the message"; + + bool functionalRead(Packet *pkt) { + return testAndRead(LineAddress, DataBlk, pkt); + } + + bool functionalWrite(Packet *pkt) { + return testAndWrite(LineAddress, DataBlk, pkt); + } +} diff -r 3ee9d80f490f -r 7b001aa001f0 src/mem/protocol/MOESI_hammer_bcu.slicc --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/mem/protocol/MOESI_hammer_bcu.slicc Wed Dec 02 17:08:48 2015 -0600 @@ -0,0 +1,7 @@ +protocol "MOESI_hammer_bcu"; +include "RubySlicc_interfaces.slicc"; +include "MOESI_hammer_bcu-msg.sm"; +include "MOESI_hammer_bcu-BCU.sm"; +include "MOESI_hammer-GPUcache.sm"; +include "MOESI_hammer-dir.sm"; +include "MOESI_hammer-dma.sm"; \ No newline at end of file diff -r 3ee9d80f490f -r 7b001aa001f0 src/mem/protocol/SConsopts --- a/src/mem/protocol/SConsopts Wed Dec 02 17:08:48 2015 -0600 +++ b/src/mem/protocol/SConsopts Wed Dec 02 17:08:48 2015 -0600 @@ -34,6 +34,7 @@ all_protocols.extend([ 'VI_hammer', + 'MOESI_hammer_bcu', ]) protocol_dirs.append(str(Dir('.').abspath)) # HG changeset patch # User Lena Olson # Date 1449097728 21600 # Node ID 1b6fae7cb423ecdab1ce5ec545f97637e438f884 # Parent 7b001aa001f007c6af4ab6ddfcf2f3b491b108d9 Adds a simple permission table and uses it in the BCU diff -r 7b001aa001f0 -r 1b6fae7cb423 configs/GPUConfig.py --- a/configs/GPUConfig.py Wed Dec 02 17:08:48 2015 -0600 +++ b/configs/GPUConfig.py Wed Dec 02 17:08:48 2015 -0600 @@ -63,6 +63,8 @@ parser.add_option("--gpu_tlb_entries", type="int", default=0, help="Number of entries in GPU TLB. 0 implies infinite") parser.add_option("--gpu_tlb_assoc", type="int", default=0, help="Associativity of the L1 TLB. 
0 implies infinite") parser.add_option("--pwc_size", default="8kB", help="Capacity of the page walk cache") + parser.add_option("--plb_size", default=64, help="Entries in the PLB (Border Control Buffer/Cache)") + parser.add_option("--plb_alignment", default=0, help="log of addresses per entry in PLB") def configureMemorySpaces(options): total_mem_range = AddrRange(options.total_mem_size)
diff -r 7b001aa001f0 -r 1b6fae7cb423 configs/fs_fusion.py --- a/configs/fs_fusion.py Wed Dec 02 17:08:48 2015 -0600 +++ b/configs/fs_fusion.py Wed Dec 02 17:08:48 2015 -0600 @@ -151,8 +151,17 @@ voltage_domain = system.voltage_domain) Ruby.create_system(options, system, system.iobus, system._dma_ports) + system.gpu.ruby = system.ruby system.ruby.clk_domain = system.ruby_clk_domain +if hasattr(system.ruby, "bcu_cntrl"): + system.gpu.shader_mmu.permission_table = system.ruby.bcu_cntrl.perm_table +else: + system.gpu.shader_mmu.permission_table = PermissionTable() + +system.gpu.shader_mmu.permission_table.mem_size = options.total_mem_size +system.gpu.shader_mmu.permission_table.plb_size = options.plb_size +system.gpu.shader_mmu.permission_table.plb_alignment = options.plb_alignment # # Connect CPU ports
diff -r 7b001aa001f0 -r 1b6fae7cb423 configs/gpu_protocol/MOESI_hammer_bcu_fusion.py --- a/configs/gpu_protocol/MOESI_hammer_bcu_fusion.py Wed Dec 02 17:08:48 2015 -0600 +++ b/configs/gpu_protocol/MOESI_hammer_bcu_fusion.py Wed Dec 02 17:08:48 2015 -0600 @@ -242,6 +242,7 @@ # BCU cntrl = BorderControlUnit_Controller(version = 0, + perm_table = PermissionTable(), ruby_system = ruby_system) ruby_system.bcu_cntrl = cntrl topology.addController(cntrl)
diff -r 7b001aa001f0 -r 1b6fae7cb423 src/gpu/ShaderMMU.py --- a/src/gpu/ShaderMMU.py Wed Dec 02 17:08:48 2015 -0600 +++ b/src/gpu/ShaderMMU.py Wed Dec 02 17:08:48 2015 -0600 @@ -32,6 +32,7 @@ from m5.proxy import * from m5.util import fatal from ClockedObject import ClockedObject +from PermissionTable import PermissionTable class ShaderMMU(ClockedObject): type = 'ShaderMMU' @@ -52,6 +53,8 @@ l2_tlb_entries = Param.Int(0, "Number of entries in the L2 TLB (0=>no L2)") l2_tlb_assoc = Param.Int(4, "Associativity of the L2 TLB (0 => full)") + permission_table = Param.PermissionTable(Parent.any, "Permission Table") + prefetch_buffer_size = Param.Int(0, "Size of the prefetch buffer") def setUpPagewalkers(self, num, port, bypass_l1):
diff -r 7b001aa001f0 -r 1b6fae7cb423 src/gpu/shader_mmu.cc --- a/src/gpu/shader_mmu.cc Wed Dec 02 17:08:48 2015 -0600 +++ b/src/gpu/shader_mmu.cc Wed Dec 02 17:08:48 2015 -0600 @@ -51,7 +51,8 @@ ShaderMMU::ShaderMMU(const Params *p) : ClockedObject(p), pagewalkers(p->pagewalkers), latency(p->latency), outstandingFaultStatus(None), curOutstandingWalks(0), - prefetchBufferSize(p->prefetch_buffer_size) + prefetchBufferSize(p->prefetch_buffer_size), + permissionTable(p->permission_table) { activeWalkers.resize(pagewalkers.size()); if (p->l2_tlb_entries > 0) { @@ -91,6 +92,10 @@ l2hits++; req->setPaddr(ppn + offset); req_tlb->insert(vpn, ppn); + if (permissionTable){ + permissionTable->insert(ppn, (mode == BaseTLB::Write)); + //TODO: add in mem write request if returns true + } translation->finish(NoFault, req, tc, mode); return; } @@ -104,6 +109,10 @@ if (tlb) { tlb->insert(vpn, ppn); } + if (permissionTable){ + permissionTable->insert(ppn, (mode == BaseTLB::Write)); + //TODO: add in mem write request if returns true + } req->setPaddr(ppn + offset); req_tlb->insert(vpn, ppn); translation->finish(NoFault, req, tc, mode); @@ -209,6 +218,13 @@
list<TranslationRequest*>::iterator it; list<TranslationRequest*> &walks = outstandingWalks[vpn]; DPRINTF(ShaderMMU, "Walk satisfies %d outstanding reqs\n", walks.size()); + if (permissionTable && !translation->prefetch){ + TheISA::TlbEntry *entry; + assert(translation->pageWalker); + entry = translation->pageWalker->lookup(vpn, false); + permissionTable->insert(ppn, entry->writable); + //TODO add in memory write request on miss? + } for (it = walks.begin(); it != walks.end(); it++) { TranslationRequest *t = (*it);
diff -r 7b001aa001f0 -r 1b6fae7cb423 src/gpu/shader_mmu.hh --- a/src/gpu/shader_mmu.hh Wed Dec 02 17:08:48 2015 -0600 +++ b/src/gpu/shader_mmu.hh Wed Dec 02 17:08:48 2015 -0600 @@ -44,6 +44,7 @@ #include "sim/clocked_object.hh" #include "sim/faults.hh" #include "sim/tlb.hh" +#include "mem/ruby/PermissionTable.hh" class ShaderMMU : public ClockedObject { @@ -151,6 +152,8 @@ // Insert prefetch into prefetch buffer void insertPrefetch(Addr vpn, Addr ppn); + PermissionTable * permissionTable; + public: /// Constructor typedef ShaderMMUParams Params;
diff -r 7b001aa001f0 -r 1b6fae7cb423 src/mem/protocol/MOESI_hammer_bcu-BCU.sm --- a/src/mem/protocol/MOESI_hammer_bcu-BCU.sm Wed Dec 02 17:08:48 2015 -0600 +++ b/src/mem/protocol/MOESI_hammer_bcu-BCU.sm Wed Dec 02 17:08:48 2015 -0600 @@ -1,28 +1,64 @@ +/* + * Copyright (c) 2014 Mark D. Hill and David A. Wood + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * AMD's contributions to the MOESI hammer protocol do not constitute an + * endorsement of its similarity to any AMD products. + * + * Authors: Lena Olson + * Jason Power + */ - +structure (PermissionTable, external="yes") { + void logAddress(Address, bool); + bool checkPLB(Address, bool); + bool checkTable(Address, bool); +} machine(BorderControlUnit, "Border control unit") -: Cycles latency := 1; - - // NOTE: I'm pretty sure the directory to cache side is not necessary. - // But we'll just never send messages to this controller over these - // networks.
+: PermissionTable * perm_table; + Cycles hit_latency := 10; + Cycles miss_latency := 100; // Interface with the cache (copied from MOESI_hammer-dir.sm) - // MessageBuffer * forwardFromDirToCache, network="To", virtual_network="3", ordered="false", vnet_type="forward"; - // MessageBuffer * responseFromDirToCache, network="To", virtual_network="4", ordered="false", vnet_type="response"; - - MessageBuffer * unblockToDirFromCache, network="From", virtual_network="5", ordered="false", vnet_type="unblock"; - MessageBuffer * responseToDirFromCache, network="From", virtual_network="4", ordered="false", vnet_type="response"; - MessageBuffer * requestToDirFromCache, network="From", virtual_network="2", ordered="false", vnet_type="request", recycle_latency="1"; + MessageBuffer * unblockToDirFromCache, network="From", virtual_network="5", + ordered="false", vnet_type="unblock"; + MessageBuffer * responseToDirFromCache, network="From", virtual_network="4", + ordered="false", vnet_type="response"; + MessageBuffer * requestToDirFromCache, network="From", virtual_network="2", + ordered="false", vnet_type="request", recycle_latency="1"; // Interface with the directory (copied from MOESI_hammer-cache.sm) - MessageBuffer * requestFromCacheToDir, network="To", virtual_network="2", ordered="false", vnet_type="request"; - MessageBuffer * responseFromCacheToDir, network="To", virtual_network="4", ordered="false", vnet_type="response"; - MessageBuffer * unblockFromCacheToDir, network="To", virtual_network="5", ordered="false", vnet_type="unblock"; + MessageBuffer * requestFromCacheToDir, network="To", virtual_network="2", + ordered="false", vnet_type="request"; + MessageBuffer * responseFromCacheToDir, network="To", virtual_network="4", + ordered="false", vnet_type="response"; + MessageBuffer * unblockFromCacheToDir, network="To", virtual_network="5", + ordered="false", vnet_type="unblock"; - // MessageBuffer * forwardToCacheFromDir, network="From", virtual_network="3", ordered="false", vnet_type="forward"; - // MessageBuffer * responseToCacheFromDir, network="From", virtual_network="4", ordered="false", vnet_type="response"; { state_declaration(State, desc="Cache states") { @@ -30,10 +66,8 @@ } enumeration(Event, desc="BCU events") { - //FwdFromDir; RespFromCache; UnblockFromCache; - //ResponseFromDir; RequestFromCache; } @@ -57,8 +91,6 @@ void setState(Address addr, State state) { } - //out_port(fwdNetwork_out, RequestMsg, forwardFromDirToCache); - //out_port(responseNetworkToCache_out, ResponseMsg, responseFromDirToCache); out_port(requestNetwork_out, RequestMsg, requestFromCacheToDir); out_port(responseNetworkToDir_out, ResponseMsg, responseFromCacheToDir); out_port(unblockNetwork_out, ResponseMsg, unblockFromCacheToDir); @@ -76,14 +108,6 @@ } } - // in_port(respFromDir_in, ResponseMsg, responseToCacheFromDir) { - // if (respFromDir_in.isReady()) { - // peek(respFromDir_in, ResponseMsg) { - // trigger(Event:ResponseFromDir, in_msg.Addr); - // } - // } - // } - in_port(respFromCache_in, ResponseMsg, responseToDirFromCache) { if (respFromCache_in.isReady()) { peek(respFromCache_in, ResponseMsg) { @@ -99,37 +123,34 @@ } } } - - // in_port(fwdFromDir_in, ResponseMsg, forwardToCacheFromDir) { - // if (fwdFromDir_in.isReady()) { - // peek(fwdFromDir_in, ResponseMsg) { - // trigger(Event:FwdFromDir, in_msg.Addr); - // } - // } - // } - - // action(ff_fwdtocache, "ff", desc="FwdFromDir") { - // peek(fwdFromDir_in, ResponseMsg) { - // enqueue(fwdNetwork_out, ResponseMsg, latency) { - // out_msg.Addr := in_msg.Addr; - 
// out_msg.Type := in_msg.Type; - // out_msg.Sender := in_msg.Sender; - // out_msg.CurOwner := in_msg.CurOwner; - // out_msg.Destination := in_msg.OriginalDestination; - // out_msg.DataBlk := in_msg.DataBlk; - // out_msg.Dirty := in_msg.Dirty; - // out_msg.Acks := in_msg.Acks; - // out_msg.MessageSize := in_msg.MessageSize; - // out_msg.InitialRequestTime := in_msg.InitialRequestTime; - // out_msg.ForwardRequestTime := in_msg.ForwardRequestTime; - // out_msg.SilentAcks := in_msg.SilentAcks; - // } - // } - // } action(rsd_resptodir, "rtd", desc="RespFromCache") { peek(respFromCache_in, ResponseMsg) { DPRINTF(RubySlicc, "Got req to addr %s\n", in_msg.Addr); + Cycles latency := hit_latency; + /*Do we need read permission*/ + if (in_msg.Type == CoherenceResponseType:ACK_SHARED){ + perm_table.logAddress(in_msg.Addr, false); + bool ret := perm_table.checkPLB(in_msg.Addr, false); + if (ret != true) { + perm_table.checkTable(in_msg.Addr, false); + latency := miss_latency; + } + } + /*Do we need write permission*/ + else if (in_msg.Type == CoherenceResponseType:DATA || + in_msg.Type == CoherenceResponseType:DATA_EXCLUSIVE || + in_msg.Type == CoherenceResponseType:DATA_SHARED){ + /*With this coherence protocol, hard to tell if this was an + innocent read that got O or what. Use dirty bit.*/ + perm_table.logAddress(in_msg.Addr, true); + bool ret := perm_table.checkPLB(in_msg.Addr, in_msg.Dirty); + if (ret != true) { + perm_table.checkTable(in_msg.Addr, true); + latency := miss_latency; + } + } + enqueue(responseNetworkToDir_out, ResponseMsg, latency) { out_msg.Addr := in_msg.Addr; out_msg.Type := in_msg.Type; @@ -147,29 +168,34 @@ } } - // action(rsc_resptocache, "rsc", desc="ResponseFromDir") { - // peek(respFromDir_in, ResponseMsg) { - // enqueue(responseNetworkToCache_out, ResponseMsg, latency) { - // out_msg.Addr := in_msg.Addr; - // out_msg.Type := in_msg.Type; - // out_msg.Sender := in_msg.Sender; - // out_msg.CurOwner := in_msg.CurOwner; - // out_msg.Destination := in_msg.OriginalDestination; - // out_msg.DataBlk := in_msg.DataBlk; - // out_msg.Dirty := in_msg.Dirty; - // out_msg.Acks := in_msg.Acks; - // out_msg.MessageSize := in_msg.MessageSize; - // out_msg.InitialRequestTime := in_msg.InitialRequestTime; - // out_msg.ForwardRequestTime := in_msg.ForwardRequestTime; - // out_msg.SilentAcks := in_msg.SilentAcks; - // } - // } - // } - action(uc_unblocktodir, "uc", desc="UnblockFromCache") { peek(unblock_in, ResponseMsg) { DPRINTF(RubySlicc, "Got req to addr %s\n", in_msg.Addr); - enqueue(unblockNetwork_out, ResponseMsg, latency) { + Cycles latency := hit_latency; + /*Do we need read permission*/ + if (in_msg.Type == CoherenceResponseType:UNBLOCK || + in_msg.Type == CoherenceResponseType:UNBLOCKS || + in_msg.Type == CoherenceResponseType:WB_CLEAN || + in_msg.Type == CoherenceResponseType:UNBLOCKM || + in_msg.Type == CoherenceResponseType:WB_EXCLUSIVE_CLEAN){ + perm_table.logAddress(in_msg.Addr, false); + bool ret := perm_table.checkPLB(in_msg.Addr, false); + if (ret != true) { + perm_table.checkTable(in_msg.Addr, false); + latency := miss_latency; + } + } + /*Do we need write permission*/ + else if (in_msg.Type == CoherenceResponseType:WB_DIRTY || + in_msg.Type == CoherenceResponseType:WB_EXCLUSIVE_DIRTY){ + perm_table.logAddress(in_msg.Addr, true); + bool ret := perm_table.checkPLB(in_msg.Addr, true); + if (ret != true) { + perm_table.checkTable(in_msg.Addr, true); + latency := miss_latency; + } + } + enqueue(unblockNetwork_out, ResponseMsg, latency) { out_msg.Addr := in_msg.Addr; 
out_msg.Type := in_msg.Type; out_msg.Sender := in_msg.Sender; @@ -189,6 +215,29 @@ action(rqc_reqtodir, "rqc", desc="RequestFromCache") { peek(requestFromCache_in, RequestMsg) { DPRINTF(RubySlicc, "Got req to addr %s\n", in_msg.Addr); + Cycles latency := hit_latency; + /*Do we need read permission*/ + if (in_msg.Type == CoherenceRequestType:GETX || + in_msg.Type == CoherenceRequestType:GETS || + in_msg.Type == CoherenceRequestType:MERGED_GETS || + in_msg.Type == CoherenceRequestType:GETF){ + perm_table.logAddress(in_msg.Addr, false); + bool ret := perm_table.checkPLB(in_msg.Addr, false); + if (ret != true) { + perm_table.checkTable(in_msg.Addr, false); + latency := miss_latency; + } + } + /*Do we need write permission*/ + else if (in_msg.Type == CoherenceRequestType:PUT || + in_msg.Type == CoherenceRequestType:PUTF){ + perm_table.logAddress(in_msg.Addr, true); + bool ret := perm_table.checkPLB(in_msg.Addr, true); + if (ret != true) { + perm_table.checkTable(in_msg.Addr, true); + latency := miss_latency; + } + } enqueue(requestNetwork_out, RequestMsg, latency) { out_msg.Addr := in_msg.Addr; out_msg.Type := in_msg.Type; @@ -208,10 +257,6 @@ unblock_in.dequeue(); } - // action(prfd_poprespFromDir, "prfd", desc="") { - // respFromDir_in.dequeue(); - // } - action(prfc_poprespFromCache, "prfc", desc="") { respFromCache_in.dequeue(); } @@ -220,16 +265,6 @@ requestFromCache_in.dequeue(); } - // action(pf_popfwdFromDir, "pf", desc="") { - // fwdFromDir_in.dequeue(); - // } - - - // transition({I}, {FwdFromDir}) { - // ff_fwdtocache; - // pf_popfwdFromDir; - // } - transition({I}, {RespFromCache}) { rsd_resptodir; prfc_poprespFromCache; @@ -240,13 +275,8 @@ pu_popunblock; } - // transition({I}, {ResponseFromDir}) { - // rsc_resptocache; - // prfd_poprespFromDir; - // } - transition({I}, {RequestFromCache}) { rqc_reqtodir; pq_popreqFromCache; } -} \ No newline at end of file +} diff -r 7b001aa001f0 -r 1b6fae7cb423 src/mem/protocol/SConsopts --- a/src/mem/protocol/SConsopts Wed Dec 02 17:08:48 2015 -0600 +++ b/src/mem/protocol/SConsopts Wed Dec 02 17:08:48 2015 -0600 @@ -40,3 +40,4 @@ protocol_dirs.append(str(Dir('.').abspath)) slicc_includes.append('mem/ruby/RubySlicc_GPUMappings.hh') +slicc_includes.append('mem/ruby/PermissionTable.hh') diff -r 7b001aa001f0 -r 1b6fae7cb423 src/mem/ruby/PermissionTable.cc --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/mem/ruby/PermissionTable.cc Wed Dec 02 17:08:48 2015 -0600 @@ -0,0 +1,334 @@ +/* + * Copyright (c) 2014 Mark D. Hill and David A. Wood + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Authors: Lena Olson, Jason Power + * + */ + + +#include "debug/PermissionTable.hh" +#include "PermissionTable.hh" + +PermissionTable::PermissionTable(const Params *p) : + ClockedObject(p), mem_size(p->mem_size), plb_size(p->plb_size), + plb_alignment(p->plb_alignment) +{ + page_shift = 12; //because I'm too dumb to use TheISA + + // init based on params giving size of mem + + // bitmap has 2 bits per page (page is 4kB) + permission_bitmap = std::vector<bool>((mem_size >> page_shift) * 2, false); + + // PLB is currently a list, so we don't have to touch that + + plb_entry_size = 1 << plb_alignment; + printf("plb alignment is %lu, plb_entry_size is %lu\n", plb_alignment, plb_entry_size); +} + +void PermissionTable::logAddress(Address addr, bool isWrite) +{ + DPRINTF(PermissionTable, "Addr %#x, %s\n", addr.getAddress(), isWrite ? "W" : "R"); +} + +/* After ATS translates address, insert it into permission table & cache. + * Should only be called from ATS (shaderMMU) + * Returns true if anything was inserted, since then a memory request + * should be generated. */ +/* addr should be full-length address*/ +bool PermissionTable::insert(Addr addr, bool hasWrite) +{ + // first check that the address is valid + assert((addr & 0x7f) == 0); + assert (addr < mem_size); + Address plb_tag = getPLBTag(Address(addr)); + Addr ppn = addr >> page_shift; + uint32_t offset = (ppn % plb_entry_size) * 2; + + plb_entry entry = popPLBEntry(plb_tag); + if (entry.first.getAddress() == 0){ + //not in PLB - insert it + hasWrite ? m_plb_write_insert_miss++ : m_plb_read_insert_miss++; + if (plb_size > 0){ + //update with this entry + entry.first = plb_tag; + //initialize vector + entry.second = std::vector<bool>(plb_entry_size * 2, false); + entry.second[offset] = true; + entry.second[offset + 1] = hasWrite; + + //add it to PLB + plb.push_front(entry); + + while (plb.size() > plb_size){ //PLB is full + plb.pop_back(); + m_plb_evict++; + } + } + // finally, insert into table + writeTable(Address(addr), hasWrite); + + return true; //so slicc can generate mem req + } + + assert(plb_alignment || entry.second[offset]); + // is the block already in the plb? If so, push it to the front + // check permissions + if (!entry.second[offset] || (hasWrite && !entry.second[offset+1])){ + hasWrite ? m_plb_write_partial_hit++ : m_plb_read_partial_hit++; + //bad permissions - update entry + entry.second[offset] = true; + entry.second[offset+1] = hasWrite; + //update table + writeTable(Address(addr), hasWrite); + } + else { + hasWrite ?
m_plb_write_insert_hit++ : m_plb_read_insert_hit++; + } + + //Add entry back to front of list + plb.push_front(entry); + return false; +} + +/* Expects that page_addr is full-length address*/ +bool PermissionTable::checkPLB(Address addr, bool isWriteback) +{ + if (plb_size <= 0){ + return false; + } + Address plb_tag = getPLBTag(addr); + Addr ppn = addr.shiftLowOrderBits(page_shift); + uint32_t offset = (ppn % plb_entry_size) * 2; + + plb_entry entry = popPLBEntry(plb_tag); + if (entry.first.getAddress() == 0){ + isWriteback ? m_plb_write_miss++ : m_plb_read_miss++; + return false; + } + + if (isWriteback && !entry.second[offset+1]){ + printf("lena: Writeback for non-writable block? %llx\n", addr.getAddress()); + //In this case, we want to check the table + m_plb_write_mismatch++; + return false; + } + + isWriteback ? m_plb_write_hit++ : m_plb_read_hit++; + //update LRU + plb.push_front(entry); + return true; +} + +/*Helper functions for dealing with PLB*/ +/* requires pre-shifted address (tag) */ +PermissionTable::plb_entry PermissionTable::popPLBEntry(Address plb_tag){ + plb_entry ret; + ret.first = Address(0); + //get addr aligned + for (auto it = plb.begin(); it != plb.end(); it++){ + if (it->first == plb_tag){ + ret = *it; + plb.erase(it); + return ret; + } + } + return ret; +} + +Address PermissionTable::getPLBTag(Address addr){ + return Address(addr.shiftLowOrderBits(plb_alignment + page_shift)); +} + +/* Requires full-length address*/ +bool PermissionTable::checkTable(Address addr, bool isWriteback) +{ + Address plb_tag = getPLBTag(addr); + m_table_read++; + bool writeable = checkTableWrite(addr); + bool readable = checkTableRead(addr); + if (!readable || (isWriteback && !writeable)){ + printf("lena: Warning: checkTable failed for %llx %s\n", addr.getAddress(), isWriteback ? "W" : "R"); + } + + if (plb_size > 0) { + //update the PLB + //Get the corresponding entry from the table + plb_entry entry = getTableBlock(plb_tag); + + plb.push_front(entry); + + while (plb.size() > plb_size){ //PLB is full + plb.pop_back(); + m_plb_evict++; + } + } + + return isWriteback ? 
writeable : readable; +} + + +/* Helper functions for dealing with the table */ +void PermissionTable::writeTable(Address addr, bool hasWrite){ + m_table_write++; + Addr ppn = addr.shiftLowOrderBits(page_shift); + permission_bitmap[(ppn*2)] = true; + permission_bitmap[(ppn*2)+1] = permission_bitmap[(ppn*2)+1] | hasWrite; +} + +bool PermissionTable::checkTableRead(Address addr){ + Addr ppn = addr.shiftLowOrderBits(page_shift); + return permission_bitmap[(ppn * 2)]; +} + +bool PermissionTable::checkTableWrite(Address addr){ + Addr ppn = addr.shiftLowOrderBits(page_shift); + return permission_bitmap[(ppn * 2) + 1]; +} + +Address PermissionTable::getTableAddress(Address addr){ + assert(false); // can't remember why I wrote this function + //get byte address + Address a(addr.shiftLowOrderBits(page_shift) / 4); + //make into block address + a.makeLineAddress(); + return a; +} + +PermissionTable::plb_entry PermissionTable::getTableBlock(Address plb_tag){ + //printf("Get Table Block %llx\n", plb_tag.getAddress()); + //plb_tag is the first bits of the ppn; need to refill 0s + Addr ppn = plb_tag.getAddress() << plb_alignment; + plb_entry ret; + ret.first = plb_tag; + +#if 0 + printf("ppn*2 is %lx, (ppn+plb_entry_size)*2 is %lx\n", ppn*2, (ppn+plb_entry_size)*2); + + auto table_first = permission_bitmap.begin(); + auto table_last = permission_bitmap.begin(); + std::advance(table_first, ppn*2); + std::advance(table_last, (ppn+plb_entry_size)*2); + + ret.second.resize(plb_entry_size*2, 0); + + //do the copy + std::copy(table_first, table_last, ret.second.begin()); + assert(ret.second.size() == plb_entry_size*2); +#endif + + for (int i = 0; i < plb_entry_size*2; i++){ + ret.second.push_back(permission_bitmap[ppn*2+i]); + } + assert(ret.second.size() == plb_entry_size*2); + return ret; +} + +// STATS LIVE HERE +void +PermissionTable::regStats() +{ + + m_plb_read_hit + .name(name() + ".plb_read_hit") + .desc("Number of PLB read hits") + ; + + m_plb_write_hit + .name(name() + ".plb_write_hit") + .desc("Number of PLB write hits") + ; + + m_plb_read_miss + .name(name() + ".plb_read_miss") + .desc("Number of PLB read misses") + ; + + m_plb_write_miss + .name(name() + ".plb_write_miss") + .desc("Number of PLB write misses") + ; + + m_plb_read_insert_hit + .name(name() + ".plb_read_insert_hit") + .desc("Number of PLB insert with read hits") + ; + + m_plb_write_insert_hit + .name(name() + ".plb_write_insert_hit") + .desc("Number of PLB insert with write hits") + ; + + m_plb_read_insert_miss + .name(name() + ".plb_read_insert_miss") + .desc("Number of PLB insert with read misses") + ; + + m_plb_write_insert_miss + .name(name() + ".plb_write_insert_miss") + .desc("Number of PLB insert with write misses") + ; + + m_plb_read_partial_hit + .name(name() + ".plb_read_partial_hit") + .desc("Read insert with block hit, page miss") + ; + + m_plb_write_partial_hit + .name(name() + ".plb_write_partial_hit") + .desc("Write insert with block hit, page miss") + ; + + + + m_plb_write_mismatch + .name(name() + ".plb_write_mismatch") + .desc("Number of times we missed plb_write checks") + ; + + m_plb_evict + .name(name() + ".plb_evict") + .desc("Number of PLB evictions") + ; + + m_table_read + .name(name() + ".table_read") + .desc("Number of permission table reads") + ; + + m_table_write + .name(name() + ".table_write") + .desc("Number of permission table writes") + ; +} + +PermissionTable * +PermissionTableParams::create() +{ + return new PermissionTable(this); +} + diff -r 7b001aa001f0 -r 1b6fae7cb423 
src/mem/ruby/PermissionTable.hh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/mem/ruby/PermissionTable.hh Wed Dec 02 17:08:48 2015 -0600 @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2012-2013 Mark D. Hill and David A. Wood + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Authors: Lena Olson, Jason Power + * + */ + +#ifndef __PERMISSION_TABLE_HH__ +#define __PERMISSION_TABLE_HH__ + +#include "base/statistics.hh" +#include "config/the_isa.hh" +#include "mem/ruby/common/Address.hh" +#include "mem/ruby/system/System.hh" +#include "params/PermissionTable.hh" +#include "sim/clocked_object.hh" + +class PermissionTable : public ClockedObject +{ + protected: + typedef PermissionTableParams Params; + + public: + PermissionTable(const Params *p); + + void logAddress(Address addr, bool isWrite); + + bool insert(Addr page_addr, bool hasWrite); + + bool checkPLB(Address addr, bool isWriteback); + + bool checkTable(Address addr, bool isWriteback); + + // What 64-byte block is this address found in? + Address getTableAddress(Address addr); + + + private: + + uint64_t mem_size; + uint32_t plb_size; + uint64_t plb_alignment; //if x, 2^x addresses per entry + uint64_t plb_entry_size; + + // Why is this hardcoded? Easier to manage storage when we only know size + typedef std::pair<Address, std::vector<bool> > plb_entry; + + // cache / buffer (s) (address, write-permission) + // at runtime. + std::list<plb_entry> plb; + + + // bitmap: pairs of read-write bits. Read is first.
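+ //
+ // Layout: permission_bitmap[2*ppn] is the read bit and
+ // permission_bitmap[2*ppn+1] the write bit for physical page ppn; a 1GB
+ // memory at 4kB pages therefore needs 2 * 2^18 = 512Ki bits (64kB).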
+ std::vector<bool> permission_bitmap; + + + // PLB helper functions + plb_entry popPLBEntry(Address addr); + Address getPLBTag(Address addr); + + // table (bitmap) helper functions + void writeTable(Address addr, bool hasWrite); + bool checkTableRead(Address addr); + bool checkTableWrite(Address addr); + plb_entry getTableBlock(Address addr); + + uint64_t page_shift; //can't get ISA to work + + + + + // STATS LIVE HERE + public: + void regStats(); + + Stats::Scalar m_plb_read_hit; + Stats::Scalar m_plb_write_hit; + Stats::Scalar m_plb_read_miss; + Stats::Scalar m_plb_write_miss; + + Stats::Scalar m_plb_read_insert_hit; + Stats::Scalar m_plb_write_insert_hit; + Stats::Scalar m_plb_read_insert_miss; + Stats::Scalar m_plb_write_insert_miss; + Stats::Scalar m_plb_read_partial_hit; + Stats::Scalar m_plb_write_partial_hit; + + Stats::Scalar m_plb_evict; + Stats::Scalar m_plb_write_mismatch; + + Stats::Scalar m_table_read; + Stats::Scalar m_table_write; + +}; + +#endif // __PERMISSION_TABLE_HH__
diff -r 7b001aa001f0 -r 1b6fae7cb423 src/mem/ruby/PermissionTable.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/mem/ruby/PermissionTable.py Wed Dec 02 17:08:48 2015 -0600 @@ -0,0 +1,40 @@ +# Copyright (c) 2014 Mark D. Hill and David A. Wood +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# Authors: Jason Power +# Lena Olson +# + +from ClockedObject import ClockedObject +from m5.params import * + +class PermissionTable(ClockedObject): + type = 'PermissionTable' + cxx_class = 'PermissionTable' + cxx_header = "src/mem/ruby/PermissionTable.hh" + mem_size = Param.MemorySize("1GB", "physical memory capacity in bytes") + plb_size = Param.Int(64, "PLB size in entries") + plb_alignment = Param.Int(0, "Log2 of # addresses per PLB entry")
diff -r 7b001aa001f0 -r 1b6fae7cb423 src/mem/ruby/SConscript --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/mem/ruby/SConscript Wed Dec 02 17:08:48 2015 -0600 @@ -0,0 +1,57 @@ +# -*- mode:python -*- + +# Copyright (c) 2011 Mark D. Hill and David A. Wood +# All rights reserved.
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +Import('*') + +SimObject('PermissionTable.py') + +Source('PermissionTable.cc') + +DebugFlag('PermissionTable') + +from os.path import basename, isdir, join as joinpath + +# +# Link includes for SLICC ('cause it's dumb!!) +# +generated_dir = Dir('../../mem/protocol') + +def MakeIncludeAction(target, source, env): + f = file(str(target[0]), 'w') + for s in source: + print >>f, '#include "%s"' % str(s.abspath) + f.close() + +def MakeInclude(source): + target = generated_dir.File(basename(source)) + include_action1 = MakeAction(MakeIncludeAction, Transform("MAKE INC", 1)) + env.Command(target, source, include_action1) + +# Since this is a SLICC external object +MakeInclude('PermissionTable.hh') \ No newline at end of file # HG changeset patch # User Lena Olson # Date 1449097729 21600 # Node ID 4c279b99f8f872610c293d4581e2849ccbae9827 # Parent 1b6fae7cb423ecdab1ce5ec545f97637e438f884 imported patch extend-perm-table-VI diff -r 1b6fae7cb423 -r 4c279b99f8f8 configs/gpu_protocol/VI_hammer.py --- a/configs/gpu_protocol/VI_hammer.py Wed Dec 02 17:08:48 2015 -0600 +++ b/configs/gpu_protocol/VI_hammer.py Wed Dec 02 17:08:49 2015 -0600 @@ -224,6 +224,6 @@ # Connect the dma controller to the network dma_cntrl.responseFromDir = ruby_system.network.master - dma_cntrl.requestToDir = ruby_system.network.slave + dma_cntrl.reqToDirectory = ruby_system.network.slave return (cpu_sequencers, dir_cntrl_nodes, dma_cntrl_nodes, topology) diff -r 1b6fae7cb423 -r 4c279b99f8f8 configs/gpu_protocol/VI_hammer_bcu.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/configs/gpu_protocol/VI_hammer_bcu.py Wed Dec 02 17:08:49 2015 -0600 @@ -0,0 +1,8 @@ + +def define_options(parser): + parser.add_option("--allow-atomic-migration", action="store_true", + help="allow migratory sharing for atomic only accessed blocks") + parser.add_option("--pf-on", action="store_true", + help="Hammer: enable Probe Filter") + parser.add_option("--dir-on", action="store_true", + help="Hammer: enable Full-bit Directory") \ No newline at end of file diff -r 1b6fae7cb423 -r 4c279b99f8f8 
configs/gpu_protocol/VI_hammer_bcu_fusion.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/configs/gpu_protocol/VI_hammer_bcu_fusion.py Wed Dec 02 17:08:49 2015 -0600 @@ -0,0 +1,297 @@ +# Copyright (c) 2006-2007 The Regents of The University of Michigan +# Copyright (c) 2009 Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# Authors: Brad Beckmann + +import math +import m5 +import VI_hammer +from m5.objects import * +from m5.defines import buildEnv +from Cluster import Cluster + +# +# Note: the L1 Cache latency is only used by the sequencer on fast path hits +# +class L1Cache(RubyCache): + latency = 1 + +# +# Note: the L2 Cache latency is not currently used +# +class L2Cache(RubyCache): + latency = 15 + +def create_system(options, system, dma_devices, ruby_system): + + if not buildEnv['GPGPU_SIM']: + m5.util.panic("This script requires GPGPU-Sim integration to be built.") + + # Run the protocol script to setup CPU cluster, directory and DMA + (all_sequencers, dir_cntrls, dma_cntrls, cpu_cluster) = \ + VI_hammer.create_system(options, + system, + dma_devices, + ruby_system) + + cpu_cntrl_count = len(cpu_cluster) + len(dir_cntrls) + + # + # Build GPU cluster + # + gpu_cluster = Cluster(intBW = 32, extBW = 32) + gpu_cluster.disableConnectToParent() + + l2_bits = int(math.log(options.num_l2caches, 2)) + block_size_bits = int(math.log(options.cacheline_size, 2)) + # This represents the L1 to L2 interconnect latency + # NOTE! 
This latency is in Ruby (cache) cycles, not SM cycles + per_hop_interconnect_latency = 45 # ~15 GPU cycles + num_dance_hall_hops = int(math.log(options.num_sc, 2)) + if num_dance_hall_hops == 0: + num_dance_hall_hops = 1 + l1_to_l2_noc_latency = per_hop_interconnect_latency * num_dance_hall_hops + + # + # Caches for GPU cores + # + for i in xrange(options.num_sc): + # + # First create the Ruby objects associated with the GPU cores + # + cache = L1Cache(size = options.sc_l1_size, + assoc = options.sc_l1_assoc, + replacement_policy = "LRU", + start_index_bit = block_size_bits, + dataArrayBanks = 4, + tagArrayBanks = 4, + dataAccessLatency = 4, + tagAccessLatency = 4, + resourceStalls = False) + + l1_cntrl = GPUL1Cache_Controller(version = i, + cache = cache, + l2_select_num_bits = l2_bits, + num_l2 = options.num_l2caches, + issue_latency = l1_to_l2_noc_latency, + number_of_TBEs = options.gpu_l1_buf_depth, + ruby_system = ruby_system) + + gpu_seq = RubySequencer(version = options.num_cpus + i, + icache = cache, + dcache = cache, + access_phys_mem = True, + max_outstanding_requests = options.gpu_l1_buf_depth, + ruby_system = ruby_system, + deadlock_threshold = 2000000, + connect_to_io = False) + + l1_cntrl.sequencer = gpu_seq + + exec("ruby_system.l1_cntrl_sp%02d = l1_cntrl" % i) + + # + # Add controllers and sequencers to the appropriate lists + # + all_sequencers.append(gpu_seq) + gpu_cluster.add(l1_cntrl) + + # Connect the controller to the network + l1_cntrl.requestFromL1Cache = ruby_system.network.slave + l1_cntrl.atomicRequestFromL1Cache = ruby_system.network.slave + l1_cntrl.responseToL1Cache = ruby_system.network.master + + l2_index_start = block_size_bits + l2_bits + # Use L2 cache and interconnect latencies to calculate protocol latencies + # NOTE! These latencies are in Ruby (cache) cycles, not SM cycles + l2_cache_access_latency = 30 # ~10 GPU cycles + l2_to_l1_noc_latency = per_hop_interconnect_latency * num_dance_hall_hops + l2_to_mem_noc_latency = 125 # ~40 GPU cycles + + l2_clusters = [] + for i in xrange(options.num_l2caches): + # + # First create the Ruby objects associated with this cpu + # + l2_cache = L2Cache(size = options.sc_l2_size, + assoc = options.sc_l2_assoc, + start_index_bit = l2_index_start, + replacement_policy = "LRU", + dataArrayBanks = 4, + tagArrayBanks = 4, + dataAccessLatency = 4, + tagAccessLatency = 4, + resourceStalls = options.gpu_l2_resource_stalls) + + l2_cntrl = GPUL2Cache_Controller(version = i, + L2cache = l2_cache, + l2_response_latency = l2_cache_access_latency + + l2_to_l1_noc_latency, + l2_request_latency = l2_to_mem_noc_latency, + ruby_system = ruby_system) + + exec("ruby_system.l2_cntrl%d = l2_cntrl" % i) + l2_cluster = Cluster(intBW = 32, extBW = 32) + l2_cluster.add(l2_cntrl) + gpu_cluster.add(l2_cluster) + l2_clusters.append(l2_cluster) + + # Connect the controller to the network + l2_cntrl.responseToL1Cache = ruby_system.network.slave + l2_cntrl.requestFromCache = ruby_system.network.slave + l2_cntrl.responseFromCache = ruby_system.network.slave + l2_cntrl.unblockFromCache = ruby_system.network.slave + + l2_cntrl.requestFromL1Cache = ruby_system.network.master + l2_cntrl.atomicRequestFromL1Cache = ruby_system.network.master + l2_cntrl.forwardToCache = ruby_system.network.master + l2_cntrl.responseToCache = ruby_system.network.master + + ############################################################################ + # Pagewalk cache + # NOTE: We use a CPU L1 cache controller here. 
This is to facilitate MMU + # cache coherence (as the GPU L1 caches are incoherent without flushes). + # The L2 cache is small, and should have minimal effect on the + # performance (see Section 6.2 of Power et al. HPCA 2014). + pwd_cache = L1Cache(size = options.pwc_size, + assoc = 16, # 64 is fully associative @ 8kB + replacement_policy = "LRU", + start_index_bit = block_size_bits, + latency = 8, + resourceStalls = False) + # Small cache since CPU L1 requires I and D + pwi_cache = L1Cache(size = "512B", + assoc = 2, + replacement_policy = "LRU", + start_index_bit = block_size_bits, + latency = 8, + resourceStalls = False) + + # Small cache since CPU L1 controller requires L2 + l2_cache = L2Cache(size = "512B", + assoc = 2, + start_index_bit = block_size_bits, + latency = 1, + resourceStalls = False) + + l1_cntrl = L1Cache_Controller(version = options.num_cpus, + L1Icache = pwi_cache, + L1Dcache = pwd_cache, + L2cache = l2_cache, + send_evictions = False, + issue_latency = l1_to_l2_noc_latency, + cache_response_latency = 1, + l2_cache_hit_latency = 1, + number_of_TBEs = options.gpu_l1_buf_depth, + ruby_system = ruby_system) + + cpu_seq = RubySequencer(version = options.num_cpus + options.num_sc, + icache = pwd_cache, # Never get data from pwi_cache + dcache = pwd_cache, + access_phys_mem = True, + max_outstanding_requests = options.gpu_l1_buf_depth, + ruby_system = ruby_system, + deadlock_threshold = 2000000, + connect_to_io = False) + + l1_cntrl.sequencer = cpu_seq + + + ruby_system.l1_pw_cntrl = l1_cntrl + all_sequencers.append(cpu_seq) + + gpu_cluster.add(l1_cntrl) + + # Connect the L1 controller and the network + # Connect the buffers from the controller to network + l1_cntrl.requestFromCache = ruby_system.network.slave + l1_cntrl.responseFromCache = ruby_system.network.slave + l1_cntrl.unblockFromCache = ruby_system.network.slave + + # Connect the buffers from the network to the controller + l1_cntrl.forwardToCache = ruby_system.network.master + l1_cntrl.responseToCache = ruby_system.network.master + + + # + # Create controller for the copy engine to connect to in GPU cluster + # Cache is unused by controller + # + cache = L1Cache(size = "4096B", assoc = 2) + + gpu_ce_seq = RubySequencer(version = options.num_cpus + options.num_sc+1, + icache = cache, + dcache = cache, + access_phys_mem = True, + max_outstanding_requests = 64, + support_inst_reqs = False, + ruby_system = ruby_system, + connect_to_io = False) + + gpu_ce_cntrl = GPUCopyDMA_Controller(version = 0, + sequencer = gpu_ce_seq, + number_of_TBEs = 256, + ruby_system = ruby_system) + + ruby_system.l1_cntrl_ce = gpu_ce_cntrl + + all_sequencers.append(gpu_ce_seq) + + gpu_ce_cntrl.responseFromDir = ruby_system.network.master + gpu_ce_cntrl.reqToDirectory = ruby_system.network.slave + + # BCU + bcu_cntrl = BorderControlUnit_Controller(version = 0, + perm_table = PermissionTable(), + ruby_system = ruby_system) + ruby_system.bcu_cntrl = bcu_cntrl + + bcu_cntrl.unblockToDirFromCache = ruby_system.network.master + bcu_cntrl.responseToDirFromCache = ruby_system.network.master + bcu_cntrl.requestToDirFromCache = ruby_system.network.master + + # Connect the buffers from the network to the controller + bcu_cntrl.requestFromCacheToDir = ruby_system.network.slave + bcu_cntrl.responseFromCacheToDir = ruby_system.network.slave + bcu_cntrl.unblockFromCacheToDir = ruby_system.network.slave + + complete_cluster = Cluster(intBW = 32, extBW = 32) + complete_cluster.add(bcu_cntrl) + complete_cluster.add(gpu_ce_cntrl) +
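# NOTE: the paired buffer names above (*ToDirFromCache vs. + # *FromCacheToDir) suggest the BCU is interposed on each + # cache-to-directory message class (request, response, unblock) so it + # can run its PermissionTable checks in-line; the root cluster below + # then gathers every controller into a single hierarchy for the + # topology code. +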
complete_cluster.add(cpu_cluster) + complete_cluster.add(gpu_cluster) + + for cntrl in dir_cntrls: + complete_cluster.add(cntrl) + + for cntrl in dma_cntrls: + complete_cluster.add(cntrl) + + for cluster in l2_clusters: + complete_cluster.add(cluster) + + return (all_sequencers, dir_cntrls, complete_cluster) diff -r 1b6fae7cb423 -r 4c279b99f8f8 src/mem/protocol/MOESI_hammer_bcu-BCU.sm --- a/src/mem/protocol/MOESI_hammer_bcu-BCU.sm Wed Dec 02 17:08:48 2015 -0600 +++ b/src/mem/protocol/MOESI_hammer_bcu-BCU.sm Wed Dec 02 17:08:49 2015 -0600 @@ -139,10 +139,10 @@ } /*Do we need write permission*/ else if (in_msg.Type == CoherenceResponseType:DATA || - in_msg.Type == CoherenceResponseType:DATA_EXCLUSIVE || - in_msg.Type == CoherenceResponseType:DATA_SHARED){ - /*With this coherence protocol, hard to tell if this was an - innocent read that got O or what. Use dirty bit.*/ + in_msg.Type == CoherenceResponseType:DATA_EXCLUSIVE || + in_msg.Type == CoherenceResponseType:DATA_SHARED){ + /*With this coherence protocol, hard to tell if this was an + innocent read that got O or what. Use dirty bit.*/ perm_table.logAddress(in_msg.Addr, true); bool ret := perm_table.checkPLB(in_msg.Addr, in_msg.Dirty); if (ret != true) { @@ -175,9 +175,9 @@ /*Do we need read permission*/ if (in_msg.Type == CoherenceResponseType:UNBLOCK || in_msg.Type == CoherenceResponseType:UNBLOCKS || - in_msg.Type == CoherenceResponseType:WB_CLEAN || - in_msg.Type == CoherenceResponseType:UNBLOCKM || - in_msg.Type == CoherenceResponseType:WB_EXCLUSIVE_CLEAN){ + in_msg.Type == CoherenceResponseType:WB_CLEAN || + in_msg.Type == CoherenceResponseType:UNBLOCKM || + in_msg.Type == CoherenceResponseType:WB_EXCLUSIVE_CLEAN){ perm_table.logAddress(in_msg.Addr, false); bool ret := perm_table.checkPLB(in_msg.Addr, false); if (ret != true) { @@ -187,7 +187,7 @@ } /*Do we need write permission*/ else if (in_msg.Type == CoherenceResponseType:WB_DIRTY || - in_msg.Type == CoherenceResponseType:WB_EXCLUSIVE_DIRTY){ + in_msg.Type == CoherenceResponseType:WB_EXCLUSIVE_DIRTY){ perm_table.logAddress(in_msg.Addr, true); bool ret := perm_table.checkPLB(in_msg.Addr, true); if (ret != true) { @@ -195,7 +195,7 @@ latency := miss_latency; } } - enqueue(unblockNetwork_out, ResponseMsg, latency) { + enqueue(unblockNetwork_out, ResponseMsg, latency) { out_msg.Addr := in_msg.Addr; out_msg.Type := in_msg.Type; out_msg.Sender := in_msg.Sender; @@ -215,29 +215,29 @@ action(rqc_reqtodir, "rqc", desc="RequestFromCache") { peek(requestFromCache_in, RequestMsg) { DPRINTF(RubySlicc, "Got req to addr %s\n", in_msg.Addr); - Cycles latency := hit_latency; - /*Do we need read permission*/ - if (in_msg.Type == CoherenceRequestType:GETX || - in_msg.Type == CoherenceRequestType:GETS || - in_msg.Type == CoherenceRequestType:MERGED_GETS || - in_msg.Type == CoherenceRequestType:GETF){ - perm_table.logAddress(in_msg.Addr, false); - bool ret := perm_table.checkPLB(in_msg.Addr, false); - if (ret != true) { - perm_table.checkTable(in_msg.Addr, false); - latency := miss_latency; - } - } - /*Do we need write permission*/ - else if (in_msg.Type == CoherenceRequestType:PUT || - in_msg.Type == CoherenceRequestType:PUTF){ - perm_table.logAddress(in_msg.Addr, true); - bool ret := perm_table.checkPLB(in_msg.Addr, true); - if (ret != true) { - perm_table.checkTable(in_msg.Addr, true); - latency := miss_latency; - } - } + Cycles latency := hit_latency; + /*Do we need read permission*/ + if (in_msg.Type == CoherenceRequestType:GETX || + in_msg.Type == CoherenceRequestType:GETS || + in_msg.Type == 
CoherenceRequestType:MERGED_GETS || + in_msg.Type == CoherenceRequestType:GETF){ + perm_table.logAddress(in_msg.Addr, false); + bool ret := perm_table.checkPLB(in_msg.Addr, false); + if (ret != true) { + perm_table.checkTable(in_msg.Addr, false); + latency := miss_latency; + } + } + /*Do we need write permission*/ + else if (in_msg.Type == CoherenceRequestType:PUT || + in_msg.Type == CoherenceRequestType:PUTF){ + perm_table.logAddress(in_msg.Addr, true); + bool ret := perm_table.checkPLB(in_msg.Addr, true); + if (ret != true) { + perm_table.checkTable(in_msg.Addr, true); + latency := miss_latency; + } + } enqueue(requestNetwork_out, RequestMsg, latency) { out_msg.Addr := in_msg.Addr; out_msg.Type := in_msg.Type; diff -r 1b6fae7cb423 -r 4c279b99f8f8 src/mem/protocol/SConsopts --- a/src/mem/protocol/SConsopts Wed Dec 02 17:08:48 2015 -0600 +++ b/src/mem/protocol/SConsopts Wed Dec 02 17:08:49 2015 -0600 @@ -34,6 +34,7 @@ all_protocols.extend([ 'VI_hammer', + 'VI_hammer_bcu', 'MOESI_hammer_bcu', ]) diff -r 1b6fae7cb423 -r 4c279b99f8f8 src/mem/protocol/VI_hammer-msg.sm --- a/src/mem/protocol/VI_hammer-msg.sm Wed Dec 02 17:08:48 2015 -0600 +++ b/src/mem/protocol/VI_hammer-msg.sm Wed Dec 02 17:08:49 2015 -0600 @@ -100,6 +100,7 @@ MachineID Requestor, desc="Node who initiated the request"; NetDest MergedRequestors, desc="Merge set of read requestors"; NetDest Destination, desc="Multicast destination mask"; + NetDest OriginalDestination, desc="Multicast destination mask"; MessageSizeType MessageSize, desc="size category of the message"; bool DirectedProbe, default="false", desc="probe filter directed probe"; Cycles InitialRequestTime, default="Cycles(0)", desc="time the initial requests was sent from the L1Cache"; @@ -140,6 +141,7 @@ MachineID Sender, desc="Node who sent the data"; MachineID CurOwner, desc="current owner of the block, used for UnblockS responses"; NetDest Destination, desc="Node to whom the data is sent"; + NetDest OriginalDestination, desc="Multicast destination mask"; DataBlock DataBlk, desc="data for the cache line"; bool Dirty, desc="Is the data dirty (different than memory)?"; int Acks, default="0", desc="How many messages this counts as"; diff -r 1b6fae7cb423 -r 4c279b99f8f8 src/mem/protocol/VI_hammer_bcu-GPUL2cache.sm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/mem/protocol/VI_hammer_bcu-GPUL2cache.sm Wed Dec 02 17:08:49 2015 -0600 @@ -0,0 +1,1529 @@ + +machine(GPUL2Cache, "Simple write back L2 cache") + : CacheMemory * L2cache; + Cycles l2_request_latency := 260; + Cycles l2_response_latency := 2; + Cycles cache_response_latency := 260; + +//Note: we might have a problem if two Get atomics arrive from different L1's at the same time + + + // NETWORK BUFFERS + // Buffers to and from L1 caches + MessageBuffer * requestFromL1Cache, network="From", virtual_network="7", + ordered="true", vnet_type="request"; + MessageBuffer * responseToL1Cache, network="To", virtual_network="6", + ordered="true", vnet_type="response"; + MessageBuffer * atomicRequestFromL1Cache, network="From", virtual_network="8", + ordered="true", vnet_type="request"; + + // Buffers to / from the dir and other caches + MessageBuffer * requestFromCache, network="To", virtual_network="2", + ordered="false", vnet_type="request"; + MessageBuffer * responseFromCache, network="To", virtual_network="4", + ordered="false", vnet_type="response"; + MessageBuffer * unblockFromCache, network="To", virtual_network="5", + ordered="false", vnet_type="unblock"; + + MessageBuffer * forwardToCache, network="From", 
virtual_network="3", + ordered="false", vnet_type="forward"; + MessageBuffer * responseToCache, network="From", virtual_network="4", + ordered="false", vnet_type="response"; + +{ + // STATES + state_declaration(State, desc="Cache states") { + I, AccessPermission:Invalid, desc="Idle"; + S, AccessPermission:Read_Only, desc="Shared"; + O, AccessPermission:Read_Only, desc="Owned"; + M, AccessPermission:Read_Only, desc="Modified (dirty)"; + MM, AccessPermission:Read_Write, desc="Modified (dirty and locally modified)"; + + // States for atomics + MM_A, AccessPermission:Busy, "MM^A", desc="Done an atomic get, waiting for the atomic put"; + IM_A, AccessPermission:Busy, "IM^A", desc="Done an atomic get, like IM"; + SM_A, AccessPermission:Busy, "SM^A", desc="Done an atomic get, like SM"; + OM_A, AccessPermission:Busy, "OM^A", desc="Done an atomic get, like OM"; + SM_AA, AccessPermission:Busy, "SM^AA", desc="Waiting for final acks"; + IM_AA, AccessPermission:Busy, "IM^AA", desc="Waiting for final acks"; + + // Transient states (from hammer) + IM, AccessPermission:Busy, "IM", desc="Issued GetX"; + ISM, AccessPermission:Read_Only, "ISM", desc="Issued GetX, received valid data, waiting for all acks"; + SM, AccessPermission:Read_Only, "SM", desc="Issued GetX, we still have a valid copy of the line"; + OM, AccessPermission:Read_Only, "OM", desc="Issued GetX, received data"; + IS, AccessPermission:Busy, "IS", desc="Issued GetS"; + SS, AccessPermission:Read_Only, "SS", desc="Issued GetS, received data, waiting for all acks"; + OI, AccessPermission:Busy, "OI", desc="Issued PutO, waiting for ack"; + MI, AccessPermission:Busy, "MI", desc="Issued PutX, waiting for ack"; + II, AccessPermission:Busy, "II", desc="Issued PutX/O, saw Other_GETS or Other_GETX, waiting for ack"; + + M_W, AccessPermission:Read_Only, "M^W", desc="Issued GetS, received exclusive data, waiting for acks"; + MM_W, AccessPermission:Read_Write, "MM^W", desc="Issued GetX, received exclusive data"; + } + + // EVENTS + enumeration(Event, desc="Cache events") { + // From L1 + Get, desc="Get request from L1"; + Store, desc="Put request from L1"; + Replacement, desc="Replace a block"; + Get_Atom, desc="Atomic get request from L1"; + Put_Atom, desc="Atomic put request from L1"; + + // From CPU caches + Other_GETX, desc="A GetX from another processor"; + Other_GETS, desc="A GetS from another processor"; + Merged_GETS, desc="A Merged GetS from another processor"; + NC_DMA_GETS, desc="special GetS when only DMA exists"; + Invalidate, desc="Invalidate block"; + + // ??? + Block_Ack, desc="the directory is blocked and ready for the flush"; + + // From dir + Ack, desc="Received an ack message"; + Shared_Ack, desc="Received an ack message, responder has a shared copy"; + Data, desc="Received a data message"; + Shared_Data, desc="Received a data message, responder has a shared copy"; + Exclusive_Data, desc="Received a data message, responder had an exclusive copy, they gave it to us"; + + Writeback_Ack, desc="Writeback O.K. from directory"; + Writeback_Nack, desc="Writeback not O.K. 
from directory"; + + // triggers + All_acks, desc="Received all required data and message acks"; + All_acks_no_sharers, desc="Received all acks and no other processor has a shared copy"; + } + + enumeration(RequestType, desc="Type of request for each transition") { + DataArrayRead, desc="L2 Data array read"; + DataArrayWrite, desc="L2 Data array write"; + TagArrayRead, desc="L2 Tag array read"; + TagArrayWrite, desc="L2 Tag array write"; + } + + // STRUCTURE DEFINITIONS + + // CacheEntry + structure(Entry, desc="...", interface="AbstractCacheEntry") { + State CacheState, desc="cache state"; + bool Dirty, desc="Is the data dirty (different than memory)?"; + DataBlock DataBlk, desc="Data in the block"; + } + + + // TBE fields + structure(TBE, desc="...") { + State TBEState, desc="Transient state"; + DataBlock DataBlk, desc="data for the block, required for concurrent writebacks"; + bool Dirty, desc="Is the data dirty (different than memory)?"; + int NumPendingMsgs, desc="Number of acks/data messages that this processor is waiting for"; + bool Sharers, desc="On a GetS, did we find any other sharers in the system"; + bool AppliedSilentAcks, default="false", desc="for full-bit dir, does the pending msg count reflect the silent acks"; + MachineID LastResponder, desc="last machine to send a response for this request"; + MachineID CurOwner, desc="current owner of the block, used for UnblockS responses"; + Cycles InitialRequestTime, default="Cycles(0)", desc="time the initial requests was sent from the L1Cache"; + Cycles ForwardRequestTime, default="Cycles(0)", desc="time the dir forwarded the request"; + Cycles FirstResponseTime, default="Cycles(0)", desc="the Cycles the first response was received"; + + DataBlock DirtyDataBlk, desc="Dirty data for a write. 
Separate from DataBlk since that's 'clean' data from other caches"; + int Offset, desc="Offset of write into line"; + int Size, desc="Size of the write"; + + MachineID Requestor, desc="The requestor for this block"; + } + + structure(TBETable, external="yes") { + TBE lookup(Address); + void allocate(Address); + void deallocate(Address); + bool isPresent(Address); + } + + + // STRUCTURES + + TBETable TBEs, template="", constructor="m_number_of_TBEs"; + + // PROTOTYPES + void set_cache_entry(AbstractCacheEntry a); + void unset_cache_entry(); + void set_tbe(TBE b); + void unset_tbe(); + + // For hammer + void wakeUpBuffers(Address a); + void wakeUpAllBuffers(); + Cycles curCycle(); + + Entry getCacheEntry(Address address), return_by_pointer="yes" { + return static_cast(Entry, "pointer", L2cache.lookup(address)); + } + + State getState(TBE tbe, Entry cache_entry, Address addr) { + if (is_valid(tbe)) { + return tbe.TBEState; + } + else if (is_valid(cache_entry)) { + return cache_entry.CacheState; + } + else { + return State:I; + } + } + + void setState(TBE tbe, Entry cache_entry, Address addr, State state) { + if (is_valid(tbe)) { + tbe.TBEState := state; + } + + if (is_valid(cache_entry)) { + cache_entry.CacheState := state; + } + } + + AccessPermission getAccessPermission(Address addr) { + TBE tbe := TBEs[addr]; + if(is_valid(tbe)) { + return GPUL2Cache_State_to_permission(tbe.TBEState); + } + + Entry cache_entry := getCacheEntry(addr); + if(is_valid(cache_entry)) { + return GPUL2Cache_State_to_permission(cache_entry.CacheState); + } + + return AccessPermission:NotPresent; + } + + void setAccessPermission(Entry cache_entry, Address addr, State state) { + if (is_valid(cache_entry)) { + cache_entry.changePermission(GPUL2Cache_State_to_permission(state)); + } + } + + DataBlock getDataBlock(Address addr), return_by_ref="yes" { + Entry cache_entry := getCacheEntry(addr); + if(is_valid(cache_entry)) { + return cache_entry.DataBlk; + } + + TBE tbe := TBEs[addr]; + if(is_valid(tbe)) { + return tbe.DataBlk; + } + + error("Missing data block"); + } + + Event L1Cache_request_type_to_event(CoherenceRequestTypeVI type, Address addr, + MachineID requestor, Entry cache_entry) { + if(type == CoherenceRequestTypeVI:GET) { + return Event:Get; + } else if (type == CoherenceRequestTypeVI:PUT) { + return Event:Store; + } else if (type == CoherenceRequestTypeVI:GET_Atom) { + return Event:Get_Atom; + } else if (type == CoherenceRequestTypeVI:PUT_Atom) { + return Event:Put_Atom; + }else { + error("Invalid L1 request type"); + } + } + + void recordRequestType(RequestType type, Address addr) { + if (type == RequestType:DataArrayRead) { + L2cache.recordRequestType(CacheRequestType:DataArrayRead); + } else if (type == RequestType:DataArrayWrite) { + L2cache.recordRequestType(CacheRequestType:DataArrayWrite); + } else if (type == RequestType:TagArrayRead) { + L2cache.recordRequestType(CacheRequestType:TagArrayRead); + } else if (type == RequestType:TagArrayWrite) { + L2cache.recordRequestType(CacheRequestType:TagArrayWrite); + } else { + error("Bad request type passed to recordRequestType"); + } + } + + bool checkResourceAvailable(RequestType type, Address addr) { + if (type == RequestType:DataArrayRead) { + return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (type == RequestType:DataArrayWrite) { + return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (type == RequestType:TagArrayRead) { + return L2cache.checkResourceAvailable(CacheResourceType:TagArray, 
addr); + } else if (type == RequestType:TagArrayWrite) { + return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else { + error("Bad request type passed to checkResourceAvailable"); + } + } + + MessageBuffer triggerQueue, ordered="false"; + + // NETWORK PORTS + + out_port(responseNetworkL1_out, ResponseMsgVI, responseToL1Cache); + + out_port(requestNetwork_out, RequestMsg, requestFromCache); + out_port(unblockNetwork_out, ResponseMsg, unblockFromCache); + out_port(responseNetwork_out, ResponseMsg, responseFromCache); + out_port(triggerQueue_out, TriggerMsg, triggerQueue); + + // Trigger Queue + in_port(triggerQueue_in, TriggerMsg, triggerQueue, rank=3) { + if (triggerQueue_in.isReady()) { + peek(triggerQueue_in, TriggerMsg) { + + Entry cache_entry := getCacheEntry(in_msg.Addr); + TBE tbe := TBEs[in_msg.Addr]; + + if (in_msg.Type == TriggerType:ALL_ACKS) { + trigger(Event:All_acks, in_msg.Addr, cache_entry, tbe); + } else if (in_msg.Type == TriggerType:ALL_ACKS_NO_SHARERS) { + trigger(Event:All_acks_no_sharers, in_msg.Addr, cache_entry, tbe); + } else { + error("Unexpected message"); + } + } + } + } + + in_port(responseToCache_in, ResponseMsg, responseToCache, rank=2) { + if (responseToCache_in.isReady()) { + peek(responseToCache_in, ResponseMsg, block_on="Addr") { + + Entry cache_entry := getCacheEntry(in_msg.Addr); + TBE tbe := TBEs[in_msg.Addr]; + + if (in_msg.Type == CoherenceResponseType:ACK) { + trigger(Event:Ack, in_msg.Addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceResponseType:ACK_SHARED) { + trigger(Event:Shared_Ack, in_msg.Addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceResponseType:DATA) { + trigger(Event:Data, in_msg.Addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceResponseType:DATA_SHARED) { + trigger(Event:Shared_Data, in_msg.Addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceResponseType:DATA_EXCLUSIVE) { + trigger(Event:Exclusive_Data, in_msg.Addr, cache_entry, tbe); + } else { + error("Unexpected message"); + } + } + } + } + // Forward Network + in_port(forwardToCache_in, RequestMsg, forwardToCache, rank=1) { + if (forwardToCache_in.isReady()) { + peek(forwardToCache_in, RequestMsg, block_on="Addr") { + + Entry cache_entry := getCacheEntry(in_msg.Addr); + TBE tbe := TBEs[in_msg.Addr]; + + if ((in_msg.Type == CoherenceRequestType:GETX) || (in_msg.Type == CoherenceRequestType:GETF)) { + trigger(Event:Other_GETX, in_msg.Addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:MERGED_GETS) { + trigger(Event:Merged_GETS, in_msg.Addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:GETS) { + if (machineCount(MachineType:L1Cache)+machineCount(MachineType:GPUL2Cache) > 1) { + trigger(Event:Other_GETS, in_msg.Addr, cache_entry, tbe); + } else { + trigger(Event:NC_DMA_GETS, in_msg.Addr, cache_entry, tbe); + } + } else if (in_msg.Type == CoherenceRequestType:INV) { + trigger(Event:Invalidate, in_msg.Addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:WB_ACK) { + trigger(Event:Writeback_Ack, in_msg.Addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:WB_NACK) { + trigger(Event:Writeback_Nack, in_msg.Addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:BLOCK_ACK) { + trigger(Event:Block_Ack, in_msg.Addr, cache_entry, tbe); + } else { + error("Unexpected message"); + } + } + } + } + + in_port(requestQueue_in, RequestMsgVI, requestFromL1Cache, desc="...") { + if (requestQueue_in.isReady()) { + 
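// Demand path from the GPU L1s: if the line is absent and its set is + // full, trigger a Replacement on the victim chosen by cacheProbe() + // before the original request itself is handled. +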
peek(requestQueue_in, RequestMsgVI, block_on="Addr") { + + Entry cache_entry := getCacheEntry(in_msg.Addr); + if (is_invalid(cache_entry) && + L2cache.cacheAvail(in_msg.Addr) == false ) { + // make room for the block + trigger(Event:Replacement, L2cache.cacheProbe(in_msg.Addr), + getCacheEntry(L2cache.cacheProbe(in_msg.Addr)), + TBEs[L2cache.cacheProbe(in_msg.Addr)]); + } + else { + trigger(L1Cache_request_type_to_event(in_msg.Type, in_msg.Addr, + in_msg.Requestor, cache_entry), + in_msg.Addr, cache_entry, TBEs[in_msg.Addr]); + } + } + } + } + + in_port(atomicRequestQueue_in, RequestMsgVI, atomicRequestFromL1Cache, desc="...") { + if (atomicRequestQueue_in.isReady()) { + peek(atomicRequestQueue_in, RequestMsgVI, block_on="Addr") { + Entry cache_entry := getCacheEntry(in_msg.Addr); + assert(is_valid(cache_entry)); + trigger(Event:Put_Atom, in_msg.Addr, cache_entry, TBEs[in_msg.Addr]); + } + } + } + + // ACTIONS + + action(a_issueGETS, "a", desc="Issue GETS") { + enqueue(requestNetwork_out, RequestMsg, l2_request_latency) { + assert(is_valid(tbe)); + out_msg.Addr := address; + out_msg.Type := CoherenceRequestType:GETS; + out_msg.Requestor := machineID; + out_msg.Destination.broadcast(MachineType:BorderControlUnit); + out_msg.OriginalDestination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.InitialRequestTime := curCycle(); + tbe.NumPendingMsgs := machineCount(MachineType:L1Cache)+machineCount(MachineType:GPUL2Cache); // One from each other cache (n-1) plus the memory (+1) + } + } + + action(b_issueGETX, "b", desc="Issue GETX") { + enqueue(requestNetwork_out, RequestMsg, l2_request_latency) { + assert(is_valid(tbe)); + out_msg.Addr := address; + out_msg.Type := CoherenceRequestType:GETX; + out_msg.Requestor := machineID; + out_msg.Destination.broadcast(MachineType:BorderControlUnit); + out_msg.OriginalDestination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.InitialRequestTime := curCycle(); + tbe.NumPendingMsgs := machineCount(MachineType:L1Cache)+machineCount(MachineType:GPUL2Cache); // One from each other cache (n-1) plus the memory (+1) + } + } + + action(d_issuePUT, "d", desc="Issue PUT") { + enqueue(requestNetwork_out, RequestMsg, l2_request_latency) { + out_msg.Addr := address; + out_msg.Type := CoherenceRequestType:PUT; + out_msg.Requestor := machineID; + out_msg.Destination.broadcast(MachineType:BorderControlUnit); + out_msg.OriginalDestination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Writeback_Control; + } + } + + action(ii_allocateL2CacheBlock, "\i", desc="Allocate a cache block") { + if (is_valid(cache_entry)) { + } else { + set_cache_entry(L2cache.allocate(address, new Entry)); + } + } + + action(rr_deallocateL2CacheBlock, "\r", desc="deallocate a cache block") { + if (is_valid(cache_entry)) { + L2cache.deallocate(address); + unset_cache_entry(); + } + } + + action(rq_popL1IncomingQueue, "rq", desc="Pop the L1 request queue") { + requestQueue_in.dequeue(); + } + + action(n_popResponseQueue, "n", desc="Pop the response queue") { + responseToCache_in.dequeue(); + } + + action(aq_popL1AtomicQueue, "aq", desc="Pop the atomic L1 request queue") { + atomicRequestQueue_in.dequeue(); + } + + action(j_popTriggerQueue, "j", desc="Pop trigger queue.") { + triggerQueue_in.dequeue(); + } + + action(l_popForwardQueue, "l", desc="Pop forwarded request queue.") { + forwardToCache_in.dequeue(); + } + + action(h_load_hit, "h",
desc="Send data to L1.") { + assert(is_valid(cache_entry)); + peek(requestQueue_in, RequestMsgVI) { + enqueue(responseNetworkL1_out, ResponseMsgVI, l2_response_latency) { + out_msg.Addr := address; + out_msg.Type := CoherenceResponseTypeVI:DATA; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.Requestor); + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + } + } + ++L2cache.demand_hits; + } + + action(ha_load_hit, "ha", desc="Send data to L1 for atomic") { + assert(is_valid(cache_entry)); + enqueue(responseNetworkL1_out, ResponseMsgVI, l2_response_latency) { + out_msg.Addr := address; + out_msg.Type := CoherenceResponseTypeVI:DATA; + out_msg.Sender := machineID; + out_msg.Destination.add(tbe.Requestor); + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + } + } + + action(hx_external_load_hit, "hx", desc="load required external msgs, send data to L1") { + assert(is_valid(cache_entry)); + assert(is_valid(tbe)); + peek(responseToCache_in, ResponseMsg) { + enqueue(responseNetworkL1_out, ResponseMsgVI, l2_response_latency) { + out_msg.Addr := address; + out_msg.Type := CoherenceResponseTypeVI:DATA; + out_msg.Sender := in_msg.Sender; + out_msg.Destination.add(tbe.Requestor); + out_msg.DataBlk := in_msg.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + } + } + } + + action(hh_store_hit, "\h", desc="Notify L1 that store completed.") { + assert(is_valid(cache_entry)); + peek(requestQueue_in, RequestMsgVI) { + cache_entry.DataBlk.copyPartial(in_msg.DataBlk, in_msg.Offset, in_msg.Size); + cache_entry.Dirty := true; + } + ++L2cache.demand_hits; + DPRINTF(RubySlicc, "%s %s\n", address, cache_entry.DataBlk); + } + + action(sx_external_store_hit, "sx", desc="store required external msgs, Notify L1 that store completed.") { + assert(is_valid(cache_entry)); + assert(is_valid(tbe)); + cache_entry.DataBlk.copyPartial(tbe.DirtyDataBlk, tbe.Offset, tbe.Size); + cache_entry.Dirty := true; + peek(responseToCache_in, ResponseMsg) { + if (machineIDToMachineType(in_msg.Sender) == MachineType:Directory) { + //profileGPUL2WriteMiss(GenericMachineType:Directory); + } + } + DPRINTF(RubySlicc, "From L1: %s %s\n", address, tbe.DirtyDataBlk); + DPRINTF(RubySlicc, "%s: offset: %d, size: %d\n", address, tbe.Offset, tbe.Size); + DPRINTF(RubySlicc, "%s %s\n", address, cache_entry.DataBlk); + } + + action(sxt_trig_ext_store_hit, "sxt", desc="store required external msgs, Notify L1 that store completed.") { + assert(is_valid(cache_entry)); + assert(is_valid(tbe)); + cache_entry.DataBlk.copyPartial(tbe.DirtyDataBlk, tbe.Offset, tbe.Size); + cache_entry.Dirty := true; + if (machineIDToMachineType(tbe.LastResponder) == MachineType:Directory) { + //profileGPUL2WriteMiss(GenericMachineType:Directory); + } else if (machineIDToMachineType(tbe.LastResponder) == MachineType:L1Cache) { + //profileGPUL2WriteMiss(GenericMachineType:L1Cache_wCC); + } else if (machineIDToMachineType(tbe.LastResponder) == MachineType:GPUL2Cache) { + //profileGPUL2WriteMiss(GenericMachineType:L1Cache_wCC); + } else { + error("Only expect responses from Directory, L1Cache or GPUL2Cache"); + } + DPRINTF(RubySlicc, "From L1: %s %s\n", address, tbe.DirtyDataBlk); + DPRINTF(RubySlicc, "%s: offset: %d, size: %d\n", address, tbe.Offset, tbe.Size); + DPRINTF(RubySlicc, "%s %s\n", address, cache_entry.DataBlk); + } + + action(sa_store_hit, "sa", desc="Notify L1 that an atomic store completed.") { + assert(is_valid(cache_entry)); + 
peek(atomicRequestQueue_in, RequestMsgVI) { + enqueue(responseNetworkL1_out, ResponseMsgVI, l2_response_latency) { + out_msg.Addr := address; + out_msg.Type := CoherenceResponseTypeVI:WB_ACK; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.Requestor); + out_msg.MessageSize := MessageSizeType:Writeback_Control; + } + } + ++L2cache.demand_hits; + } + + action(as_ackStore, "as", desc="Ack the requestor that the store is complete") { + peek(requestQueue_in, RequestMsgVI) { + enqueue(responseNetworkL1_out, ResponseMsgVI, l2_response_latency) { + out_msg.Addr := address; + out_msg.Type := CoherenceResponseTypeVI:WB_ACK; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.Requestor); + out_msg.MessageSize := MessageSizeType:Writeback_Control; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + action(aes_ackExternalStore, "aes", desc="Ack the requestor that the store is complete") { + assert(is_valid(tbe)); + enqueue(responseNetworkL1_out, ResponseMsgVI, l2_response_latency) { + out_msg.Addr := address; + out_msg.Type := CoherenceResponseTypeVI:WB_ACK; + out_msg.Sender := machineID; + out_msg.Destination.add(tbe.Requestor); + out_msg.MessageSize := MessageSizeType:Writeback_Control; + DPRINTF(RubySlicc, "%s\n", out_msg); + DPRINTF(RubySlicc, "%s %s\n", address, tbe.Requestor); + } + } + + action(es_recordRequestor, "es", desc="record the requestor ID in the TBE") { + assert(is_valid(tbe)); + peek(requestQueue_in, RequestMsgVI) { + tbe.Requestor := in_msg.Requestor; + tbe.DirtyDataBlk := in_msg.DataBlk; + tbe.Offset := in_msg.Offset; + tbe.Size := in_msg.Size; + DPRINTF(RubySlicc, "Recording requestor %s %s\n", address, in_msg.Requestor); + } + } + + action(u_writeDataToCache, "u", desc="Write data to cache") { + peek(responseToCache_in, ResponseMsg) { + assert(is_valid(cache_entry)); + cache_entry.DataBlk := in_msg.DataBlk; + cache_entry.Dirty := in_msg.Dirty; + } + } + + action(i_allocateTBE, "i", desc="Allocate TBE") { + check_allocate(TBEs); + assert(is_valid(cache_entry)); + TBEs.allocate(address); + set_tbe(TBEs[address]); + tbe.DataBlk := cache_entry.DataBlk; // Data only used for writebacks + tbe.Dirty := cache_entry.Dirty; + tbe.Sharers := false; + } + + action(s_deallocateTBE, "s", desc="Deallocate TBE") { + TBEs.deallocate(address); + unset_tbe(); + } + + action(z_stall, "z", desc="Stall") { + // empty + } + + action(zz_stallAndWaitRequestQueue, "\z", desc="...") { + stall_and_wait(requestQueue_in, address); + } + + action(m_decrementNumberOfMessages, "m", desc="Decrement the number of messages for which we're waiting") { + peek(responseToCache_in, ResponseMsg) { + assert(in_msg.Acks >= 0); + assert(is_valid(tbe)); + DPRINTF(RubySlicc, "Sender = %s\n", in_msg.Sender); + DPRINTF(RubySlicc, "SilentAcks = %d\n", in_msg.SilentAcks); + if (tbe.AppliedSilentAcks == false) { + tbe.NumPendingMsgs := tbe.NumPendingMsgs - in_msg.SilentAcks; + tbe.AppliedSilentAcks := true; + } + DPRINTF(RubySlicc, "%d\n", tbe.NumPendingMsgs); + tbe.NumPendingMsgs := tbe.NumPendingMsgs - in_msg.Acks; + DPRINTF(RubySlicc, "%d\n", tbe.NumPendingMsgs); + APPEND_TRANSITION_COMMENT(tbe.NumPendingMsgs); + APPEND_TRANSITION_COMMENT(in_msg.Sender); + tbe.LastResponder := in_msg.Sender; + if (tbe.InitialRequestTime != zero_time() && in_msg.InitialRequestTime != zero_time()) { + assert(tbe.InitialRequestTime == in_msg.InitialRequestTime); + } + if (in_msg.InitialRequestTime != zero_time()) { + tbe.InitialRequestTime := in_msg.InitialRequestTime; + } + if (tbe.ForwardRequestTime != zero_time() 
&& in_msg.ForwardRequestTime != zero_time()) { + assert(tbe.ForwardRequestTime == in_msg.ForwardRequestTime); + } + if (in_msg.ForwardRequestTime != zero_time()) { + tbe.ForwardRequestTime := in_msg.ForwardRequestTime; + } + if (tbe.FirstResponseTime == zero_time()) { + tbe.FirstResponseTime := curCycle(); + } + } + } + + action(o_checkForCompletion, "o", desc="Check if we have received all the messages required for completion") { + assert(is_valid(tbe)); + if (tbe.NumPendingMsgs == 0) { + enqueue(triggerQueue_out, TriggerMsg) { + out_msg.Addr := address; + if (tbe.Sharers) { + out_msg.Type := TriggerType:ALL_ACKS; + } else { + out_msg.Type := TriggerType:ALL_ACKS_NO_SHARERS; + } + } + } + } + + action(uo_updateCurrentOwner, "uo", desc="When moving SS state, update current owner.") { + peek(responseToCache_in, ResponseMsg) { + assert(is_valid(tbe)); + tbe.CurOwner := in_msg.Sender; + } + } + + action(p_decrementNumberOfMessagesByOne, "p", desc="Decrement the number of messages for which we're waiting by one") { + assert(is_valid(tbe)); + tbe.NumPendingMsgs := tbe.NumPendingMsgs - 1; + } + + action(pp_incrementNumberOfMessagesByOne, "\p", desc="Increment the number of messages for which we're waiting by one") { + assert(is_valid(tbe)); + tbe.NumPendingMsgs := tbe.NumPendingMsgs + 1; + } + + action(kd_wakeUpDependents, "kd", desc="wake-up dependents") { + wakeUpBuffers(address); + } + + action(ka_wakeUpAllDependents, "ka", desc="wake-up all dependents") { + wakeUpAllBuffers(); + } + + action(r_setSharerBit, "r", desc="We saw other sharers") { + assert(is_valid(tbe)); + tbe.Sharers := true; + } + + action(gm_sendUnblockM, "gm", desc="Send unblock to memory and indicate M/O/E state") { + enqueue(unblockNetwork_out, ResponseMsg, cache_response_latency) { + out_msg.Addr := address; + out_msg.Type := CoherenceResponseType:UNBLOCKM; + out_msg.Sender := machineID; + out_msg.Destination.broadcast(MachineType:BorderControlUnit); + out_msg.OriginalDestination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Unblock_Control; + } + } + + action(gs_sendUnblockS, "gs", desc="Send unblock to memory and indicate S state") { + enqueue(unblockNetwork_out, ResponseMsg, cache_response_latency) { + assert(is_valid(tbe)); + out_msg.Addr := address; + out_msg.Type := CoherenceResponseType:UNBLOCKS; + out_msg.Sender := machineID; + out_msg.CurOwner := tbe.CurOwner; + out_msg.Destination.broadcast(MachineType:BorderControlUnit); + out_msg.OriginalDestination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Unblock_Control; + } + } + + action(v_writeDataToCacheVerify, "v", desc="Write data to cache, assert it was same as before") { + peek(responseToCache_in, ResponseMsg) { + assert(is_valid(cache_entry)); + DPRINTF(RubySlicc, "Cached Data Block: %s, Msg Data Block: %s\n", + cache_entry.DataBlk, in_msg.DataBlk); + assert(cache_entry.DataBlk == in_msg.DataBlk); + cache_entry.DataBlk := in_msg.DataBlk; + cache_entry.Dirty := in_msg.Dirty || cache_entry.Dirty; + } + } + + action(q_sendDataFromTBEToCache, "q", desc="Send data from TBE to cache") { + peek(forwardToCache_in, RequestMsg) { + assert(in_msg.Requestor != machineID); + enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) { + assert(is_valid(tbe)); + out_msg.Addr := address; + out_msg.Type := CoherenceResponseType:DATA; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.Requestor); + DPRINTF(RubySlicc, "%s\n", out_msg.Destination); + out_msg.DataBlk := tbe.DataBlk; + 
out_msg.Dirty := tbe.Dirty; + if (in_msg.DirectedProbe) { + out_msg.Acks := machineCount(MachineType:L1Cache)+machineCount(MachineType:GPUL2Cache); + } else { + out_msg.Acks := 2; + } + out_msg.SilentAcks := in_msg.SilentAcks; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := in_msg.ForwardRequestTime; + } + } + } + + action(sq_sendSharedDataFromTBEToCache, "sq", desc="Send shared data from TBE to cache, still the owner") { + peek(forwardToCache_in, RequestMsg) { + assert(in_msg.Requestor != machineID); + enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) { + assert(is_valid(tbe)); + out_msg.Addr := address; + out_msg.Type := CoherenceResponseType:DATA_SHARED; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.Requestor); + DPRINTF(RubySlicc, "%s\n", out_msg.Destination); + out_msg.DataBlk := tbe.DataBlk; + out_msg.Dirty := tbe.Dirty; + if (in_msg.DirectedProbe) { + out_msg.Acks := machineCount(MachineType:L1Cache)+machineCount(MachineType:GPUL2Cache); + } else { + out_msg.Acks := 2; + } + out_msg.SilentAcks := in_msg.SilentAcks; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := in_msg.ForwardRequestTime; + } + } + } + + action(qm_sendDataFromTBEToCache, "qm", desc="Send data from TBE to cache, multiple sharers, still the owner") { + peek(forwardToCache_in, RequestMsg) { + enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) { + assert(is_valid(tbe)); + out_msg.Addr := address; + out_msg.Type := CoherenceResponseType:DATA_SHARED; + out_msg.Sender := machineID; + out_msg.Destination := in_msg.MergedRequestors; + DPRINTF(RubySlicc, "%s\n", out_msg.Destination); + out_msg.DataBlk := tbe.DataBlk; + out_msg.Dirty := tbe.Dirty; + out_msg.Acks := machineCount(MachineType:L1Cache)+machineCount(MachineType:GPUL2Cache); + out_msg.SilentAcks := in_msg.SilentAcks; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := in_msg.ForwardRequestTime; + } + } + } + + action(qq_sendDataFromTBEToMemory, "\q", desc="Send data from TBE to memory") { + enqueue(unblockNetwork_out, ResponseMsg, cache_response_latency) { + assert(is_valid(tbe)); + out_msg.Addr := address; + out_msg.Sender := machineID; + out_msg.Destination.broadcast(MachineType:BorderControlUnit); + out_msg.OriginalDestination.add(map_Address_to_Directory(address)); + out_msg.Dirty := tbe.Dirty; + if (tbe.Dirty) { + out_msg.Type := CoherenceResponseType:WB_DIRTY; + out_msg.DataBlk := tbe.DataBlk; + out_msg.MessageSize := MessageSizeType:Writeback_Data; + } else { + out_msg.Type := CoherenceResponseType:WB_CLEAN; + // NOTE: in a real system this would not send data. 
We send + // data here only so we can check it at the memory + out_msg.DataBlk := tbe.DataBlk; + out_msg.MessageSize := MessageSizeType:Writeback_Control; + } + } + } + + action(t_sendExclusiveDataFromTBEToMemory, "t", desc="Send exclusive data from TBE to memory") { + enqueue(unblockNetwork_out, ResponseMsg, cache_response_latency) { + assert(is_valid(tbe)); + out_msg.Addr := address; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.DataBlk := tbe.DataBlk; + out_msg.Dirty := tbe.Dirty; + if (tbe.Dirty) { + out_msg.Type := CoherenceResponseType:WB_EXCLUSIVE_DIRTY; + out_msg.DataBlk := tbe.DataBlk; + out_msg.MessageSize := MessageSizeType:Writeback_Data; + } else { + out_msg.Type := CoherenceResponseType:WB_EXCLUSIVE_CLEAN; + // NOTE: in a real system this would not send data. We send + // data here only so we can check it at the memory + out_msg.DataBlk := tbe.DataBlk; + out_msg.MessageSize := MessageSizeType:Writeback_Control; + } + } + } + + action(f_sendAck, "f", desc="Send ack from cache to requestor") { + peek(forwardToCache_in, RequestMsg) { + enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) { + out_msg.Addr := address; + out_msg.Type := CoherenceResponseType:ACK; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.Requestor); + out_msg.Acks := 1; + out_msg.SilentAcks := in_msg.SilentAcks; + assert(in_msg.DirectedProbe == false); + out_msg.MessageSize := MessageSizeType:Response_Control; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := in_msg.ForwardRequestTime; + } + } + } + + action(ff_sendAckShared, "\f", desc="Send shared ack from cache to requestor") { + peek(forwardToCache_in, RequestMsg) { + enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) { + out_msg.Addr := address; + out_msg.Type := CoherenceResponseType:ACK_SHARED; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.Requestor); + out_msg.Acks := 1; + out_msg.SilentAcks := in_msg.SilentAcks; + assert(in_msg.DirectedProbe == false); + out_msg.MessageSize := MessageSizeType:Response_Control; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := in_msg.ForwardRequestTime; + } + } + } + + action(g_sendUnblock, "g", desc="Send unblock to memory") { + enqueue(unblockNetwork_out, ResponseMsg, cache_response_latency) { + out_msg.Addr := address; + out_msg.Type := CoherenceResponseType:UNBLOCK; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Unblock_Control; + } + } + + action(e_sendData, "e", desc="Send data from cache to requestor") { + peek(forwardToCache_in, RequestMsg) { + enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) { + assert(is_valid(cache_entry)); + out_msg.Addr := address; + out_msg.Type := CoherenceResponseType:DATA; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.Requestor); + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.Dirty := cache_entry.Dirty; + if (in_msg.DirectedProbe) { + out_msg.Acks := machineCount(MachineType:L1Cache)+machineCount(MachineType:GPUL2Cache); + } else { + out_msg.Acks := 2; + } + out_msg.SilentAcks := in_msg.SilentAcks; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := in_msg.ForwardRequestTime; + } + } + } + + action(ee_sendDataShared, "\e", desc="Send data from cache to requestor, 
remaining the owner") { + peek(forwardToCache_in, RequestMsg) { + enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) { + assert(is_valid(cache_entry)); + out_msg.Addr := address; + out_msg.Type := CoherenceResponseType:DATA_SHARED; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.Requestor); + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.Dirty := cache_entry.Dirty; + DPRINTF(RubySlicc, "%s\n", out_msg.DataBlk); + if (in_msg.DirectedProbe) { + out_msg.Acks := machineCount(MachineType:L1Cache)+machineCount(MachineType:GPUL2Cache); + } else { + out_msg.Acks := 2; + } + out_msg.SilentAcks := in_msg.SilentAcks; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := in_msg.ForwardRequestTime; + } + } + } + + action(c_sendExclusiveData, "c", desc="Send exclusive data from cache to requestor") { + peek(forwardToCache_in, RequestMsg) { + enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) { + assert(is_valid(cache_entry)); + out_msg.Addr := address; + out_msg.Type := CoherenceResponseType:DATA_EXCLUSIVE; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.Requestor); + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.Dirty := cache_entry.Dirty; + if (in_msg.DirectedProbe) { + out_msg.Acks := machineCount(MachineType:L1Cache)+machineCount(MachineType:GPUL2Cache); + } else { + out_msg.Acks := 2; + } + out_msg.SilentAcks := in_msg.SilentAcks; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := in_msg.ForwardRequestTime; + } + } + } + + action(em_sendDataSharedMultiple, "em", desc="Send data from cache to all requestors, still the owner") { + peek(forwardToCache_in, RequestMsg) { + enqueue(responseNetwork_out, ResponseMsg, cache_response_latency) { + assert(is_valid(cache_entry)); + out_msg.Addr := address; + out_msg.Type := CoherenceResponseType:DATA_SHARED; + out_msg.Sender := machineID; + out_msg.Destination := in_msg.MergedRequestors; + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.Dirty := cache_entry.Dirty; + DPRINTF(RubySlicc, "%s\n", out_msg.DataBlk); + out_msg.Acks := machineCount(MachineType:L1Cache)+machineCount(MachineType:GPUL2Cache); + out_msg.SilentAcks := in_msg.SilentAcks; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := in_msg.ForwardRequestTime; + } + } + } + + action(uu_profileWriteMiss, "uu", desc="...") { + ++L2cache.demand_misses; + } + + action(vv_profileReadMiss, "vv", desc="...") { + ++L2cache.demand_misses; + } + + // TRANSITIONS + + transition({IM, IS, OI, MI, II}, {Get, Store, Replacement, Get_Atom}) {} { + zz_stallAndWaitRequestQueue; + } + + transition({ISM,SM,OM,SS}, {Replacement,Store,Get_Atom}) {} { + zz_stallAndWaitRequestQueue; + } + + transition({M_W,MM_W}, {Replacement, Get_Atom}) {} { + zz_stallAndWaitRequestQueue; + } + + transition(I, Store, IM) {TagArrayRead, TagArrayWrite} { + ii_allocateL2CacheBlock; + i_allocateTBE; + es_recordRequestor; + b_issueGETX; + uu_profileWriteMiss; + rq_popL1IncomingQueue; + } + + transition({S,MM,O,M}, Get) {TagArrayRead, DataArrayRead} { + h_load_hit; + rq_popL1IncomingQueue; + } + + transition({SS,M_W,MM_W,SM,OM,ISM}, Get) {DataArrayRead} { + h_load_hit; + rq_popL1IncomingQueue; + } + + transition(MM, Store) {TagArrayRead, DataArrayWrite} { + hh_store_hit; + as_ackStore; + 
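// MM already holds write permission, so the partial store is applied + // locally and only a WB_ACK returns to the requesting L1. +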
rq_popL1IncomingQueue; + } + + transition(MM_W, Store) {DataArrayWrite} { + hh_store_hit; + as_ackStore; + rq_popL1IncomingQueue; + } + + transition(M, Store, MM) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + hh_store_hit; + as_ackStore; + rq_popL1IncomingQueue; + } + + transition(O, Store, OM) {TagArrayRead} { + i_allocateTBE; + es_recordRequestor; + b_issueGETX; + p_decrementNumberOfMessagesByOne; + uu_profileWriteMiss; + rq_popL1IncomingQueue; + } + + transition(S, Store, SM) {TagArrayRead} { + i_allocateTBE; + es_recordRequestor; + b_issueGETX; + uu_profileWriteMiss; + rq_popL1IncomingQueue; + } + + transition(I, Get, IS) {TagArrayRead} { + ii_allocateL2CacheBlock; + i_allocateTBE; + es_recordRequestor; + a_issueGETS; + vv_profileReadMiss; + rq_popL1IncomingQueue; + } + + // Let's deal with atomics + + transition(MM, Get_Atom, MM_A) { + i_allocateTBE; + es_recordRequestor; + ha_load_hit; + rq_popL1IncomingQueue; + } + + transition(I, Get_Atom, IM_A) { + ii_allocateL2CacheBlock; + i_allocateTBE; + es_recordRequestor; + b_issueGETX; + //uu_profileMiss; // TODO + rq_popL1IncomingQueue; + } + + transition(S, Get_Atom, SM_A) { + i_allocateTBE; + es_recordRequestor; + b_issueGETX; + //uu_profileMiss; // TODO + rq_popL1IncomingQueue; + } + + transition(M, Get_Atom, MM_A) { + i_allocateTBE; + es_recordRequestor; + h_load_hit; + rq_popL1IncomingQueue; + } + + transition(O, Get_Atom, OM_A) { + i_allocateTBE; + es_recordRequestor; + b_issueGETX; + p_decrementNumberOfMessagesByOne; + //uu_profileMiss; // TODO + rq_popL1IncomingQueue; + } + + transition(MM_A, Put_Atom, MM) { + sa_store_hit; + s_deallocateTBE; + aq_popL1AtomicQueue; + } + + transition(SM_A, {Data, Exclusive_Data}, SM_AA) { + v_writeDataToCacheVerify; + m_decrementNumberOfMessages; + o_checkForCompletion; + n_popResponseQueue; + } + + transition(IM_A, {Data, Exclusive_Data}, IM_AA) { + u_writeDataToCache; + m_decrementNumberOfMessages; + o_checkForCompletion; + n_popResponseQueue; + //kd_wakeUpDependents; // This is only for ex data, I don't think we need it + } + + transition(OM_A, {All_acks, All_acks_no_sharers}, MM_A) { + ha_load_hit; + gm_sendUnblockM; + j_popTriggerQueue; + kd_wakeUpDependents; + } + + transition(IM_AA, All_acks_no_sharers, MM_A) { + ha_load_hit; + gm_sendUnblockM; + j_popTriggerQueue; + kd_wakeUpDependents; + } + + transition(SM_AA, All_acks_no_sharers, MM_A) { + ha_load_hit; + gm_sendUnblockM; + j_popTriggerQueue; + kd_wakeUpDependents; + } + + transition(SM_AA, Ack) { + m_decrementNumberOfMessages; + o_checkForCompletion; + n_popResponseQueue; + } + + transition({IM_A,SM_A,OM_A,MM_A,SM_AA,IM_AA}, {Get,Get_Atom,Store,Replacement,Other_GETX,Other_GETS,Merged_GETS,NC_DMA_GETS,Invalidate}) { + z_stall; + } + + // Transitions for replacements + + transition(I, Replacement) {TagArrayRead} { + rr_deallocateL2CacheBlock; + ka_wakeUpAllDependents; + } + + transition(S, Replacement, I) {TagArrayRead, TagArrayWrite} { + rr_deallocateL2CacheBlock; + ka_wakeUpAllDependents; + } + + transition(O, Replacement, OI) {TagArrayRead} { + i_allocateTBE; + d_issuePUT; + rr_deallocateL2CacheBlock; + ka_wakeUpAllDependents; + } + + transition(O, Merged_GETS) {TagArrayRead, DataArrayRead} { + em_sendDataSharedMultiple; + l_popForwardQueue; + } + + transition({M,MM}, Replacement, MI) {TagArrayRead, DataArrayRead} { + i_allocateTBE; + d_issuePUT; + rr_deallocateL2CacheBlock; + ka_wakeUpAllDependents; + } + + transition(MI, Writeback_Ack, I) {TagArrayWrite} { + t_sendExclusiveDataFromTBEToMemory; + s_deallocateTBE; +
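// The writeback data travels from the TBE rather than the cache entry, + // because the entry was already deallocated when the PUT was issued. +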
l_popForwardQueue; + kd_wakeUpDependents; + } + + transition(OI, Writeback_Ack, I) {TagArrayWrite} { + qq_sendDataFromTBEToMemory; + s_deallocateTBE; + l_popForwardQueue; + kd_wakeUpDependents; + } + + transition({OI, MI}, {Other_GETX, Invalidate}, II) { + q_sendDataFromTBEToCache; + l_popForwardQueue; + } + + transition({OI, MI}, {NC_DMA_GETS, Other_GETS}, OI) { + sq_sendSharedDataFromTBEToCache; + l_popForwardQueue; + } + + transition({OI, MI}, Merged_GETS, OI) { + qm_sendDataFromTBEToCache; + l_popForwardQueue; + } + + // Transitions based on responses + + // Transitions from IS + + transition(IS, {Other_GETX, NC_DMA_GETS, Other_GETS, Invalidate}) { + f_sendAck; + l_popForwardQueue; + } + + transition(IS, Data, SS) {DataArrayWrite} { + u_writeDataToCache; + m_decrementNumberOfMessages; + o_checkForCompletion; + hx_external_load_hit; + uo_updateCurrentOwner; + n_popResponseQueue; + kd_wakeUpDependents; + } + + transition(IS, Exclusive_Data, M_W) {DataArrayWrite} { + u_writeDataToCache; + m_decrementNumberOfMessages; + o_checkForCompletion; + hx_external_load_hit; + n_popResponseQueue; + kd_wakeUpDependents; + } + + transition(IS, Shared_Data, SS) {DataArrayWrite} { + u_writeDataToCache; + r_setSharerBit; + m_decrementNumberOfMessages; + o_checkForCompletion; + hx_external_load_hit; + uo_updateCurrentOwner; + n_popResponseQueue; + kd_wakeUpDependents; + } + + transition(IS, Ack) { + m_decrementNumberOfMessages; + o_checkForCompletion; + n_popResponseQueue; + } + + transition(IS, Shared_Ack) { + m_decrementNumberOfMessages; + r_setSharerBit; + o_checkForCompletion; + n_popResponseQueue; + } + + // Transitions from M_W + transition(M_W, Store, MM_W) {DataArrayWrite} { + hh_store_hit; + as_ackStore; + rq_popL1IncomingQueue; + } + + transition(M_W, Ack) { + m_decrementNumberOfMessages; + o_checkForCompletion; + n_popResponseQueue; + } + + transition(M_W, All_acks_no_sharers, M) {TagArrayWrite} { + gm_sendUnblockM; + s_deallocateTBE; + j_popTriggerQueue; + kd_wakeUpDependents; + } + + // Transitions from MM_W + + transition(MM_W, Ack) { + m_decrementNumberOfMessages; + o_checkForCompletion; + n_popResponseQueue; + } + + transition(MM_W, All_acks_no_sharers, MM) {TagArrayWrite} { + gm_sendUnblockM; + s_deallocateTBE; + j_popTriggerQueue; + kd_wakeUpDependents; + } + + // Transitions from SM and IM + transition(SM, {NC_DMA_GETS, Other_GETS}) { + ff_sendAckShared; + l_popForwardQueue; + } + + transition(SM, {Other_GETX, Invalidate}, IM) { + f_sendAck; + l_popForwardQueue; + } + + transition(IM, {Other_GETX, NC_DMA_GETS, Other_GETS, Invalidate}) { + f_sendAck; + l_popForwardQueue; + } + + transition({IM, IM_A, IM_AA}, Ack) { + m_decrementNumberOfMessages; + o_checkForCompletion; + n_popResponseQueue; + } + + transition(IM, Data, ISM) {DataArrayWrite} { + u_writeDataToCache; + m_decrementNumberOfMessages; + o_checkForCompletion; + kd_wakeUpDependents; + n_popResponseQueue; + } + + transition(IM, Exclusive_Data, MM_W) {DataArrayWrite} { + u_writeDataToCache; + m_decrementNumberOfMessages; + o_checkForCompletion; + sx_external_store_hit; + aes_ackExternalStore; + n_popResponseQueue; + kd_wakeUpDependents; + } + + // Transitions from ISM + transition(ISM, Ack) { + m_decrementNumberOfMessages; + o_checkForCompletion; + n_popResponseQueue; + } + + transition(ISM, All_acks_no_sharers, MM) {DataArrayWrite, TagArrayWrite} { + sxt_trig_ext_store_hit; + aes_ackExternalStore; + gm_sendUnblockM; + s_deallocateTBE; + j_popTriggerQueue; + kd_wakeUpDependents; + } + + // Transitions from SS + transition(SS, Ack) { +
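// SS keeps collecting acks for the earlier GETS; o_checkForCompletion + // fires the All_acks* trigger once the pending count reaches zero. +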
+  transition(SS, Ack) {
+    m_decrementNumberOfMessages;
+    o_checkForCompletion;
+    n_popResponseQueue;
+  }
+
+  transition(SS, Shared_Ack) {
+    m_decrementNumberOfMessages;
+    r_setSharerBit;
+    o_checkForCompletion;
+    n_popResponseQueue;
+  }
+
+  transition(SS, All_acks, S) {TagArrayWrite} {
+    gs_sendUnblockS;
+    s_deallocateTBE;
+    j_popTriggerQueue;
+    kd_wakeUpDependents;
+  }
+
+  transition(SS, All_acks_no_sharers, S) {TagArrayWrite} {
+    // Note: The directory might still be the owner, so that is why we go to S
+    gs_sendUnblockS;
+    s_deallocateTBE;
+    j_popTriggerQueue;
+    kd_wakeUpDependents;
+  }
+
+  // Transitions from OM
+
+  transition(OM, {Other_GETX, Invalidate}, IM) {
+    e_sendData;
+    pp_incrementNumberOfMessagesByOne;
+    l_popForwardQueue;
+  }
+
+  transition(OM, {NC_DMA_GETS, Other_GETS}) {
+    ee_sendDataShared;
+    l_popForwardQueue;
+  }
+
+  transition(OM, Merged_GETS) {
+    em_sendDataSharedMultiple;
+    l_popForwardQueue;
+  }
+
+  transition({OM, OM_A}, Ack) {
+    m_decrementNumberOfMessages;
+    o_checkForCompletion;
+    n_popResponseQueue;
+  }
+
+  transition(OM, {All_acks, All_acks_no_sharers}, MM) {TagArrayWrite, DataArrayWrite} {
+    sxt_trig_ext_store_hit;
+    aes_ackExternalStore;
+    gm_sendUnblockM;
+    s_deallocateTBE;
+    j_popTriggerQueue;
+    kd_wakeUpDependents;
+  }
+
+  // Transitions from SM
+  transition({SM, SM_A}, Ack) {
+    m_decrementNumberOfMessages;
+    o_checkForCompletion;
+    n_popResponseQueue;
+  }
+
+  transition(SM, {Data, Exclusive_Data}, ISM) {
+    v_writeDataToCacheVerify;
+    m_decrementNumberOfMessages;
+    o_checkForCompletion;
+    n_popResponseQueue;
+  }
+
+  // Transitions for other cache requests
+
+  transition(I, {Other_GETX, NC_DMA_GETS, Other_GETS, Invalidate}) {TagArrayRead} {
+    f_sendAck;
+    l_popForwardQueue;
+  }
+
+  transition(S, {NC_DMA_GETS, Other_GETS}) {TagArrayRead} {
+    ff_sendAckShared;
+    l_popForwardQueue;
+  }
+
+  transition(S, {Other_GETX, Invalidate}, I) {TagArrayRead, TagArrayWrite} {
+    f_sendAck;
+    l_popForwardQueue;
+  }
+
+  transition(O, {Other_GETX, Invalidate}, I) {TagArrayRead, TagArrayWrite, DataArrayRead} {
+    e_sendData;
+    l_popForwardQueue;
+  }
+
+  transition(O, {NC_DMA_GETS, Other_GETS}) {TagArrayRead, DataArrayRead} {
+    ee_sendDataShared;
+    l_popForwardQueue;
+  }
+
+  transition(MM, {Other_GETX, Invalidate}, I) {TagArrayRead, TagArrayWrite, DataArrayRead} {
+    c_sendExclusiveData;
+    l_popForwardQueue;
+  }
+
+  transition(MM, Other_GETS, I) {TagArrayRead, TagArrayWrite, DataArrayRead} {
+    c_sendExclusiveData;
+    l_popForwardQueue;
+  }
+
+  transition(MM, NC_DMA_GETS, O) {TagArrayRead, TagArrayWrite, DataArrayRead} {
+    ee_sendDataShared;
+    l_popForwardQueue;
+  }
+
+  transition(MM, Merged_GETS, O) {TagArrayRead, TagArrayWrite, DataArrayRead} {
+    em_sendDataSharedMultiple;
+    l_popForwardQueue;
+  }
+
+  transition(M, {Other_GETX, Invalidate}, I) {TagArrayRead, TagArrayWrite, DataArrayRead} {
+    c_sendExclusiveData;
+    l_popForwardQueue;
+  }
+
+  transition(M, Other_GETS, O) {TagArrayRead, TagArrayWrite, DataArrayRead} {
+    ee_sendDataShared;
+    l_popForwardQueue;
+  }
+
+  transition(M, NC_DMA_GETS, O) {TagArrayRead, TagArrayWrite, DataArrayRead} {
+    ee_sendDataShared;
+    l_popForwardQueue;
+  }
+
+  transition(M, Merged_GETS, O) {TagArrayRead, TagArrayWrite, DataArrayRead} {
+    em_sendDataSharedMultiple;
+    l_popForwardQueue;
+  }
+
+  // Transitions from II
+  transition(II, {NC_DMA_GETS, Other_GETS, Other_GETX, Invalidate}, II) {
+    f_sendAck;
+    l_popForwardQueue;
+  }
+
+  transition(II, Writeback_Ack, I) {
+    g_sendUnblock;
+    s_deallocateTBE;
+    l_popForwardQueue;
+    kd_wakeUpDependents;
+  }
+
+  transition(II, Writeback_Nack, I) {
+    s_deallocateTBE;
+    l_popForwardQueue;
+    kd_wakeUpDependents;
+  }
+
+}
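The atomic path is the interesting addition in this controller: a Get_Atom allocates a TBE and parks the line in a transient *_A state, every conflicting event is stalled via z_stall, and only the matching Put_Atom (or the final ack) releases the block. A minimal table-driven sketch of that idea in Python; state, event, and action names are copied from the transitions above, but the lookup loop itself is illustrative, not how SLICC-generated C++ is structured:

    # Hypothetical sketch: a few GPU L2 atomic transitions as a lookup table.
    TRANSITIONS = {
        # (state, event): (next_state, actions)
        ("MM",   "Get_Atom"): ("MM_A", ["i_allocateTBE", "es_recordRequestor",
                                        "ha_load_hit", "rq_popL1IncomingQueue"]),
        ("I",    "Get_Atom"): ("IM_A", ["ii_allocateL2CacheBlock", "i_allocateTBE",
                                        "es_recordRequestor", "b_issueGETX",
                                        "rq_popL1IncomingQueue"]),
        ("MM_A", "Put_Atom"): ("MM",   ["sa_store_hit", "s_deallocateTBE",
                                        "aq_popL1AtomicQueue"]),
    }

    ATOMIC_STATES = {"IM_A", "SM_A", "OM_A", "MM_A", "SM_AA", "IM_AA"}

    def step(state, event):
        # Anything that touches a block mid-atomic stalls, cf. z_stall above.
        if (state, event) not in TRANSITIONS:
            if state in ATOMIC_STATES:
                return state, ["z_stall"]
            raise KeyError((state, event))
        return TRANSITIONS[(state, event)]

    print(step("MM", "Get_Atom"))   # -> ('MM_A', [..., 'ha_load_hit', ...])
    print(step("MM_A", "Store"))    # -> ('MM_A', ['z_stall'])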
diff -r 1b6fae7cb423 -r 4c279b99f8f8 src/mem/protocol/VI_hammer_bcu.slicc
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/src/mem/protocol/VI_hammer_bcu.slicc  Wed Dec 02 17:08:49 2015 -0600
@@ -0,0 +1,15 @@
+protocol "VI_hammer_bcu";
+
+include "RubySlicc_interfaces.slicc";
+
+include "VI_hammer-msg.sm";
+include "VI_hammer-CPUCache.sm";
+
+include "VI_hammer-GPUL1cache.sm";
+include "VI_hammer_bcu-GPUL2cache.sm";
+include "VI-ce.sm";
+
+include "VI_hammer-dir.sm";
+include "VI_hammer-dma.sm";
+
+include "MOESI_hammer_bcu-BCU.sm";
# HG changeset patch
# User Lena Olson
# Date 1449097729 21600
# Node ID ea23ae7dc3ff9da72b844b225d5cf7577b1ee328
# Parent  4c279b99f8f872610c293d4581e2849ccbae9827
Update VI_hammer and BCU so that their sequencers can't restore

diff -r 4c279b99f8f8 -r ea23ae7dc3ff configs/gpu_protocol/VI_hammer_bcu_fusion.py
--- a/configs/gpu_protocol/VI_hammer_bcu_fusion.py  Wed Dec 02 17:08:49 2015 -0600
+++ b/configs/gpu_protocol/VI_hammer_bcu_fusion.py  Wed Dec 02 17:08:49 2015 -0600
@@ -108,7 +108,8 @@
                            max_outstanding_requests = options.gpu_l1_buf_depth,
                            ruby_system = ruby_system,
                            deadlock_threshold = 2000000,
-                           connect_to_io = False)
+                           connect_to_io = False,
+                           can_restore = False)
 
     l1_cntrl.sequencer = gpu_seq
@@ -216,7 +217,8 @@
                            max_outstanding_requests = options.gpu_l1_buf_depth,
                            ruby_system = ruby_system,
                            deadlock_threshold = 2000000,
-                           connect_to_io = False)
+                           connect_to_io = False,
+                           can_restore = False)
 
     l1_cntrl.sequencer = cpu_seq
@@ -250,7 +252,8 @@
                                max_outstanding_requests = 64,
                                support_inst_reqs = False,
                                ruby_system = ruby_system,
-                               connect_to_io = False)
+                               connect_to_io = False,
+                               can_restore = False)
 
     gpu_ce_cntrl = GPUCopyDMA_Controller(version = 0,
                                          sequencer = gpu_ce_seq,
diff -r 4c279b99f8f8 -r ea23ae7dc3ff configs/gpu_protocol/VI_hammer_fusion.py
--- a/configs/gpu_protocol/VI_hammer_fusion.py  Wed Dec 02 17:08:49 2015 -0600
+++ b/configs/gpu_protocol/VI_hammer_fusion.py  Wed Dec 02 17:08:49 2015 -0600
@@ -108,7 +108,8 @@
                            max_outstanding_requests = options.gpu_l1_buf_depth,
                            ruby_system = ruby_system,
                            deadlock_threshold = 2000000,
-                           connect_to_io = False)
+                           connect_to_io = False,
+                           can_restore = False)
 
     l1_cntrl.sequencer = gpu_seq
@@ -216,7 +217,8 @@
                            max_outstanding_requests = options.gpu_l1_buf_depth,
                            ruby_system = ruby_system,
                            deadlock_threshold = 2000000,
-                           connect_to_io = False)
+                           connect_to_io = False,
+                           can_restore = False)
 
     l1_cntrl.sequencer = cpu_seq
@@ -250,7 +252,8 @@
                                max_outstanding_requests = 64,
                                support_inst_reqs = False,
                                ruby_system = ruby_system,
-                               connect_to_io = False)
+                               connect_to_io = False,
+                               can_restore = False)
 
     gpu_ce_cntrl = GPUCopyDMA_Controller(version = 0,
                                          sequencer = gpu_ce_seq,
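Each hunk in the changeset above makes the same mechanical edit: every GPU-side RubySequencer (GPU L1, CPU-side pagewalk, and copy engine) is now constructed with can_restore = False, so checkpoint restore will not try to rebuild in-flight sequencer state. A hedged Python sketch of the shared keyword pattern; the helper itself is hypothetical, while the parameter names are the ones visible in the diffs above:

    # Illustrative only: the keyword-argument set this changeset standardizes on.
    def gpu_sequencer_kwargs(options, ruby_system):
        return dict(max_outstanding_requests = options.gpu_l1_buf_depth,
                    ruby_system = ruby_system,
                    deadlock_threshold = 2000000,
                    connect_to_io = False,
                    can_restore = False)   # new: exclude from checkpoint restore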
parser.add_option("--pw_l2_latency", type="int", default=0, help="if 0, we get the value from elsewhere") + parser.add_option("--mmu_latency", type="int", default=20, help="") + parser.add_option("--l2_tlb_entries", type="int", default=0, help="") + def configureMemorySpaces(options): total_mem_range = AddrRange(options.total_mem_size) cpu_mem_range = total_mem_range @@ -189,6 +195,7 @@ for sc in gpu.shader_cores: sc.lsq = ShaderLSQ() sc.lsq.data_tlb.entries = options.gpu_tlb_entries + sc.lsq.data_tlb.associativity = options.gpu_tlb_assoc sc.lsq.forward_flush = (buildEnv['PROTOCOL'] == 'VI_hammer_fusion' \ and options.flush_kernel_end) sc.lsq.warp_size = options.gpu_warp_size @@ -231,10 +238,13 @@ # Initialize the MMU, connecting it to either the pagewalk cache port for # unified address space, or the copy engine's host-side sequencer port for # split address space architectures. - gpu.shader_mmu.setUpPagewalkers(32, + gpu.shader_mmu.setUpPagewalkers(options.concurrent_walks, ruby._cpu_ports[options.num_cpus+options.num_sc].slave, options.gpu_tlb_bypass_l1) + gpu.shader_mmu.latency = options.mmu_latency + gpu.shader_mmu.l2_tlb_entries = options.l2_tlb_entries + if options.split: # NOTE: In split address space architectures, the MMU only provides the # copy engine host-side TLB access to a page walker. This should diff -r ea23ae7dc3ff -r f3b35cc4f72d configs/fs_fusion.py --- a/configs/fs_fusion.py Wed Dec 02 17:08:49 2015 -0600 +++ b/configs/fs_fusion.py Wed Dec 02 17:08:49 2015 -0600 @@ -64,6 +64,8 @@ # Ruby.define_options(parser) +parser.add_option("--flush_tick", default=0, type="int") + (options, args) = parser.parse_args() options.ruby = True @@ -141,8 +143,9 @@ system.gpu_physmem = SimpleMemory(range = gpu_mem_range) system.gpu_physmem.port = system.iobus.master -system.gpu.test_tlb_shootdown = True -system.gpu.tlb_shootdown_tick = 5461844154685 + (29247313 / 4) +if options.flush_tick: + system.gpu.test_tlb_shootdown = True + system.gpu.tlb_shootdown_tick = options.flush_tick # # Setup Ruby diff -r ea23ae7dc3ff -r f3b35cc4f72d configs/gpu_config/gpgpusim.config.template --- a/configs/gpu_config/gpgpusim.config.template Wed Dec 02 17:08:49 2015 -0600 +++ b/configs/gpu_config/gpgpusim.config.template Wed Dec 02 17:08:49 2015 -0600 @@ -1,3 +1,5 @@ +-gpgpu_deadlock_detect 0 + # functional simulator specification -gpgpu_ptx_instruction_classification 0 -gpgpu_ptx_sim_mode 0 diff -r ea23ae7dc3ff -r f3b35cc4f72d configs/gpu_protocol/VI_hammer_bcu_fusion.py --- a/configs/gpu_protocol/VI_hammer_bcu_fusion.py Wed Dec 02 17:08:49 2015 -0600 +++ b/configs/gpu_protocol/VI_hammer_bcu_fusion.py Wed Dec 02 17:08:49 2015 -0600 @@ -182,7 +182,7 @@ assoc = 16, # 64 is fully associative @ 8kB replacement_policy = "LRU", start_index_bit = block_size_bits, - latency = 8, + latency = options.pwc_latency, resourceStalls = False) # Small cache since CPU L1 requires I and D pwi_cache = L1Cache(size = "512B", @@ -198,13 +198,17 @@ start_index_bit = block_size_bits, latency = 1, resourceStalls = False) + if (options.pw_l2_latency == 0): + m_issue_latency = l1_to_l2_noc_latency + else: + m_issue_latency = options.pw_l2_latency l1_cntrl = L1Cache_Controller(version = options.num_cpus, L1Icache = pwi_cache, L1Dcache = pwd_cache, L2cache = l2_cache, send_evictions = False, - issue_latency = l1_to_l2_noc_latency, + issue_latency = m_issue_latency, #this should be pwc_latency cache_response_latency = 1, l2_cache_hit_latency = 1, number_of_TBEs = options.gpu_l1_buf_depth, diff -r ea23ae7dc3ff -r f3b35cc4f72d 
diff -r ea23ae7dc3ff -r f3b35cc4f72d configs/gpu_protocol/VI_hammer_fusion.py
--- a/configs/gpu_protocol/VI_hammer_fusion.py  Wed Dec 02 17:08:49 2015 -0600
+++ b/configs/gpu_protocol/VI_hammer_fusion.py  Wed Dec 02 17:08:49 2015 -0600
@@ -176,20 +176,21 @@
     # Pagewalk cache
     # NOTE: We use a CPU L1 cache controller here. This is to facilatate MMU
     #       cache coherence (as the GPU L1 caches are incoherent without flushes
-    #       The L2 cache is small, and should have minimal affect on the 
+    #       The L2 cache is small, and should have minimal affect on the
     #       performance (see Section 6.2 of Power et al. HPCA 2014).
     pwd_cache = L1Cache(size = options.pwc_size,
                         assoc = 16, # 64 is fully associative @ 8kB
                         replacement_policy = "LRU",
                         start_index_bit = block_size_bits,
-                        latency = 8,
+                        latency = options.pwc_latency,
                         resourceStalls = False)
     # Small cache since CPU L1 requires I and D
     pwi_cache = L1Cache(size = "512B",
                         assoc = 2,
                         replacement_policy = "LRU",
                         start_index_bit = block_size_bits,
-                        latency = 8,
+
+                        latency = 8,
                         resourceStalls = False)
@@ -198,13 +199,17 @@
                       start_index_bit = block_size_bits,
                       latency = 1,
                       resourceStalls = False)
+    if (options.pw_l2_latency == 0):
+        m_issue_latency = l1_to_l2_noc_latency
+    else:
+        m_issue_latency = options.pw_l2_latency
 
     l1_cntrl = L1Cache_Controller(version = options.num_cpus,
                                   L1Icache = pwi_cache,
                                   L1Dcache = pwd_cache,
                                   L2cache = l2_cache,
                                   send_evictions = False,
-                                  issue_latency = l1_to_l2_noc_latency,
+                                  issue_latency = m_issue_latency, #this should be pwc_latency
                                   cache_response_latency = 1,
                                   l2_cache_hit_latency = 1,
                                   number_of_TBEs = options.gpu_l1_buf_depth,
diff -r ea23ae7dc3ff -r f3b35cc4f72d src/gpu/gpgpu-sim/cuda_gpu.cc
--- a/src/gpu/gpgpu-sim/cuda_gpu.cc  Wed Dec 02 17:08:49 2015 -0600
+++ b/src/gpu/gpgpu-sim/cuda_gpu.cc  Wed Dec 02 17:08:49 2015 -0600
@@ -751,11 +751,18 @@
         .name(name() + ".kernels_completed")
         .desc("Number of kernels completed")
         ;
+
+    shootdownTimes
+        .name(name() + ".shootdown_times")
+        .desc("Times to shootdown")
+        .init(8)
+        ;
 }
 
 void CudaGPU::TLBShootdownEvent::process()
 {
     DPRINTF(CudaGPU, "Processing shootdown!\n");
+    assert(gpu->running);
     switch(stage) {
       case Stage::Idle:
@@ -765,6 +772,7 @@
         }
         stage = Stage::Pausing;
         gpu->schedule(this, gpu->nextCycle());
+        gpu->shootdownStartTick = curTick();
         break;
       case Stage::Pausing:
         DPRINTF(CudaGPU, "Shootdown: Flushing cores\n");
@@ -797,6 +805,7 @@
         }
         stage = Stage::Idle;
         // NO need to schedule anything
+        gpu->shootdownTimes.sample(curTick()-gpu->shootdownStartTick);
         break;
       default:
         panic("Unexpected current shootdown stage");
diff -r ea23ae7dc3ff -r f3b35cc4f72d src/gpu/gpgpu-sim/cuda_gpu.hh
--- a/src/gpu/gpgpu-sim/cuda_gpu.hh  Wed Dec 02 17:08:49 2015 -0600
+++ b/src/gpu/gpgpu-sim/cuda_gpu.hh  Wed Dec 02 17:08:49 2015 -0600
@@ -347,6 +347,7 @@
 
     bool testShootdown;
     Tick shootdownTick;
+    Tick shootdownStartTick;
 
   public:
     /// Constructor
@@ -485,6 +486,7 @@
 
     /// Statistics for this GPU
     Stats::Scalar numKernelsCompleted;
+    Stats::Histogram shootdownTimes;
 
     void regStats();
 };
diff -r ea23ae7dc3ff -r f3b35cc4f72d src/gpu/shader_tlb.cc
--- a/src/gpu/shader_tlb.cc  Wed Dec 02 17:08:49 2015 -0600
+++ b/src/gpu/shader_tlb.cc  Wed Dec 02 17:08:49 2015 -0600
@@ -197,8 +197,8 @@
         return;
     }
     int way = (vpn / TheISA::PageBytes) % ways;
-    GPUTlbEntry* entry = NULL;
-    Tick minTick = curTick();
+    GPUTlbEntry* entry = &entries[way][0];
+    Tick minTick = entries[way][0].mruTick;
     for (int i=0; i < sets; i++) {
         if (entries[way][i].free) {
            entry = &entries[way][i];
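The shader_tlb.cc hunk above is a genuine bug fix, not a tuning tweak: seeding the victim search with entry = NULL and minTick = curTick() meant that a set whose entries had all been touched during the current tick produced no victim at all, while seeding with the first entry of the set guarantees one. The same pattern in a small Python sketch; the free/mruTick fields come from the diff, the surrounding scaffolding is illustrative:

    def pick_victim(way_entries):
        # way_entries: the candidate entries of one set, each with
        # .free and .mru_tick. Seed with the first candidate so a victim
        # always exists, matching the shader_tlb.cc fix above.
        victim, min_tick = way_entries[0], way_entries[0].mru_tick
        for entry in way_entries:
            if entry.free:
                return entry                  # a free slot beats any LRU victim
            if entry.mru_tick < min_tick:
                victim, min_tick = entry, entry.mru_tick
        return victim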
diff -r ea23ae7dc3ff -r f3b35cc4f72d src/mem/protocol/MOESI_hammer-GPUcache.sm
--- a/src/mem/protocol/MOESI_hammer-GPUcache.sm  Wed Dec 02 17:08:49 2015 -0600
+++ b/src/mem/protocol/MOESI_hammer-GPUcache.sm  Wed Dec 02 17:08:49 2015 -0600
@@ -155,6 +155,7 @@
     DataBlock DataBlk, desc="data for the block";
     bool FromL2, default="false", desc="block just moved from L2";
     bool AtomicAccessed, default="false", desc="block just moved from L2";
+    Address VAddr, desc="Virtual address associated with block";
   }
 
   // TBE fields
@@ -167,6 +168,7 @@
     bool AppliedSilentAcks, default="false", desc="for full-bit dir, does the pending msg count reflect the silent acks";
     MachineID LastResponder, desc="last machine to send a response for this request";
     MachineID CurOwner, desc="current owner of the block, used for UnblockS responses";
+    Address VAddr, desc="Virtual address associated with block";
     Cycles InitialRequestTime, default="Cycles(0)",
            desc="time the initial requests was sent from the L1Cache";
@@ -977,6 +979,7 @@
       tbe.DataBlk := cache_entry.DataBlk;  // Data only used for writebacks
       tbe.Dirty := cache_entry.Dirty;
       tbe.Sharers := false;
+      tbe.VAddr := cache_entry.VAddr;
     }
 
   action(it_allocateTBE, "it", desc="Allocate TBE") {
@@ -1004,6 +1007,7 @@
     assert(is_valid(tbe));
     cache_entry.Dirty := tbe.Dirty;
     cache_entry.DataBlk := tbe.DataBlk;
+    cache_entry.VAddr := tbe.VAddr;
   }
 
   action(nb_copyFromTBEToL1, "fu", desc="Copy data from TBE to L1 cache entry.") {
@@ -1012,6 +1016,7 @@
     cache_entry.Dirty := tbe.Dirty;
     cache_entry.DataBlk := tbe.DataBlk;
     cache_entry.FromL2 := true;
+    cache_entry.VAddr := tbe.VAddr;
   }
 
   action(m_decrementNumberOfMessages, "m", desc="Decrement the number of messages for which we're waiting") {
@@ -1102,6 +1107,7 @@
         DPRINTF(RubySlicc, "%s\n", out_msg.Destination);
         out_msg.DataBlk := tbe.DataBlk;
         out_msg.Dirty := tbe.Dirty;
+        out_msg.VAddr := tbe.VAddr;
         if (in_msg.DirectedProbe) {
           out_msg.Acks := machineCount(MachineType:L1Cache);
         } else {
@@ -1128,6 +1134,7 @@
         DPRINTF(RubySlicc, "%s\n", out_msg.Destination);
         out_msg.DataBlk := tbe.DataBlk;
         out_msg.Dirty := tbe.Dirty;
+        out_msg.VAddr := tbe.VAddr;
         if (in_msg.DirectedProbe) {
           out_msg.Acks := machineCount(MachineType:L1Cache);
         } else {
@@ -1153,6 +1160,7 @@
         DPRINTF(RubySlicc, "%s\n", out_msg.Destination);
         out_msg.DataBlk := tbe.DataBlk;
         out_msg.Dirty := tbe.Dirty;
+        out_msg.VAddr := tbe.VAddr;
         out_msg.Acks := machineCount(MachineType:L1Cache);
         out_msg.SilentAcks := in_msg.SilentAcks;
         out_msg.MessageSize := MessageSizeType:Response_Data;
@@ -1174,6 +1182,7 @@
       }
       out_msg.OriginalDestination.add(map_Address_to_Directory(address));
       out_msg.Dirty := tbe.Dirty;
+      out_msg.VAddr := tbe.VAddr;
       if (tbe.Dirty) {
         out_msg.Type := CoherenceResponseType:WB_DIRTY;
         out_msg.DataBlk := tbe.DataBlk;
@@ -1206,6 +1215,7 @@
       out_msg.Destination.add(map_Address_to_Directory(address));
       out_msg.DataBlk := tbe.DataBlk;
       out_msg.Dirty := tbe.Dirty;
+      out_msg.VAddr := tbe.VAddr;
       if (tbe.Dirty) {
         out_msg.Type := CoherenceResponseType:WB_EXCLUSIVE_DIRTY;
         out_msg.DataBlk := tbe.DataBlk;
@@ -1270,12 +1280,18 @@
   action(ii_allocateL1DCacheBlock, "\i", desc="Set L1 D-cache tag equal to tag of block B.") {
     if (is_invalid(cache_entry)) {
       set_cache_entry(L1Dcache.allocate(address, new Entry));
+      peek(mandatoryQueue_in, RubyRequest){
+        cache_entry.VAddr := in_msg.VirtualAddress;
+      }
     }
   }
 
   action(jj_allocateL1ICacheBlock, "\j", desc="Set L1 I-cache tag equal to tag of block B.") {
     if (is_invalid(cache_entry)) {
       set_cache_entry(L1Icache.allocate(address, new Entry));
+      peek(mandatoryQueue_in, RubyRequest) {
+        cache_entry.VAddr := in_msg.VirtualAddress;
+      }
     }
   }
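The VAddr plumbing above follows one rule: the virtual address is captured exactly once, when the block is allocated from the mandatory-queue request, and is then copied entry to TBE to outgoing message so that writebacks and probe responses can name the virtual page they concern. Restated as a compact Python sketch (the classes are stand-ins for the SLICC structures; only the VAddr field mirrors the diff):

    class CacheEntry:
        def __init__(self): self.vaddr = None

    class TBE:
        def __init__(self): self.vaddr = None

    def allocate_block(entry, mandatory_req):
        entry.vaddr = mandatory_req["virtual_address"]  # ii_/jj_allocate* above

    def allocate_tbe(tbe, entry):
        tbe.vaddr = entry.vaddr                         # i_allocateTBE above

    def build_writeback_msg(tbe):
        return {"VAddr": tbe.vaddr}                     # out_msg.VAddr := tbe.VAddr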
diff -r ea23ae7dc3ff -r f3b35cc4f72d src/mem/protocol/MOESI_hammer_bcu-BCU.sm
--- a/src/mem/protocol/MOESI_hammer_bcu-BCU.sm  Wed Dec 02 17:08:49 2015 -0600
+++ b/src/mem/protocol/MOESI_hammer_bcu-BCU.sm  Wed Dec 02 17:08:49 2015 -0600
@@ -34,6 +34,9 @@
 
 structure (PermissionTable, external="yes") {
   void logAddress(Address, bool);
+  void logAddressRead(Address);
+  void logAddressData(Address);
+  void logAddressCoherence(Address);
   bool checkPLB(Address, bool);
   bool checkTable(Address, bool);
 }
@@ -131,6 +134,7 @@
         /*Do we need read permission*/
         if (in_msg.Type == CoherenceResponseType:ACK_SHARED){
           perm_table.logAddress(in_msg.Addr, false);
+          perm_table.logAddressCoherence(in_msg.Addr);
           bool ret := perm_table.checkPLB(in_msg.Addr, false);
           if (ret != true) {
             perm_table.checkTable(in_msg.Addr, false);
@@ -144,7 +148,8 @@
           /*With this coherence protocol, hard to tell if this was an innocent
             read that got O or what. Use dirty bit.*/
           perm_table.logAddress(in_msg.Addr, true);
-          bool ret := perm_table.checkPLB(in_msg.Addr, in_msg.Dirty);
+          perm_table.logAddressData(in_msg.Addr);
+          bool ret := perm_table.checkPLB(in_msg.Addr, true);
           if (ret != true) {
             perm_table.checkTable(in_msg.Addr, true);
             latency := miss_latency;
@@ -179,6 +184,7 @@
             in_msg.Type == CoherenceResponseType:UNBLOCKM ||
             in_msg.Type == CoherenceResponseType:WB_EXCLUSIVE_CLEAN){
           perm_table.logAddress(in_msg.Addr, false);
+          perm_table.logAddressCoherence(in_msg.Addr);
           bool ret := perm_table.checkPLB(in_msg.Addr, false);
           if (ret != true) {
             perm_table.checkTable(in_msg.Addr, false);
@@ -189,6 +195,7 @@
         else if (in_msg.Type == CoherenceResponseType:WB_DIRTY ||
                  in_msg.Type == CoherenceResponseType:WB_EXCLUSIVE_DIRTY){
           perm_table.logAddress(in_msg.Addr, true);
+          perm_table.logAddressData(in_msg.Addr);
           bool ret := perm_table.checkPLB(in_msg.Addr, true);
           if (ret != true) {
             perm_table.checkTable(in_msg.Addr, true);
@@ -215,29 +222,31 @@
   action(rqc_reqtodir, "rqc", desc="RequestFromCache") {
     peek(requestFromCache_in, RequestMsg) {
       DPRINTF(RubySlicc, "Got req to addr %s\n", in_msg.Addr);
-      Cycles latency := hit_latency;
-      /*Do we need read permission*/
-      if (in_msg.Type == CoherenceRequestType:GETX ||
-          in_msg.Type == CoherenceRequestType:GETS ||
-          in_msg.Type == CoherenceRequestType:MERGED_GETS ||
-          in_msg.Type == CoherenceRequestType:GETF){
-        perm_table.logAddress(in_msg.Addr, false);
-        bool ret := perm_table.checkPLB(in_msg.Addr, false);
-        if (ret != true) {
-          perm_table.checkTable(in_msg.Addr, false);
-          latency := miss_latency;
+        Cycles latency := hit_latency;
+        /*Do we need read permission*/
+        if (in_msg.Type == CoherenceRequestType:GETX ||
+            in_msg.Type == CoherenceRequestType:GETS ||
+            in_msg.Type == CoherenceRequestType:MERGED_GETS ||
+            in_msg.Type == CoherenceRequestType:GETF){
+          perm_table.logAddress(in_msg.Addr, false);
+          perm_table.logAddressRead(in_msg.Addr);
+          bool ret := perm_table.checkPLB(in_msg.Addr, false);
+          if (ret != true) {
+            perm_table.checkTable(in_msg.Addr, false);
+            latency := miss_latency;
+          }
         }
-      }
-      /*Do we need write permission*/
-      else if (in_msg.Type == CoherenceRequestType:PUT ||
-               in_msg.Type == CoherenceRequestType:PUTF){
-        perm_table.logAddress(in_msg.Addr, true);
-        bool ret := perm_table.checkPLB(in_msg.Addr, true);
-        if (ret != true) {
-          perm_table.checkTable(in_msg.Addr, true);
-          latency := miss_latency;
+        /*Do we need write permission*/
+        else if (in_msg.Type == CoherenceRequestType:PUT ||
+                 in_msg.Type == CoherenceRequestType:PUTF){
+          perm_table.logAddress(in_msg.Addr, true);
+          perm_table.logAddressCoherence(in_msg.Addr);
+          bool ret := perm_table.checkPLB(in_msg.Addr, true);
+          if (ret != true) {
+            perm_table.checkTable(in_msg.Addr, true);
+            latency := miss_latency;
+          }
         }
-      }
       enqueue(requestNetwork_out, RequestMsg, latency) {
         out_msg.Addr := in_msg.Addr;
         out_msg.Type := in_msg.Type;
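The BCU changes above split the single logAddress() profile point three ways so the new stats can separate demand-read requests, messages that actually carry writeback data, and the remaining coherence traffic. A sketch of that classification follows; the type names are taken from the SLICC above, but the mapping is only partly visible in these hunks, so treat it as illustrative rather than exhaustive:

    READ_TYPES      = {"GETX", "GETS", "MERGED_GETS", "GETF"}
    DATA_TYPES      = {"WB_DIRTY", "WB_EXCLUSIVE_DIRTY"}
    COHERENCE_TYPES = {"ACK_SHARED", "UNBLOCKM", "WB_EXCLUSIVE_CLEAN",
                       "PUT", "PUTF"}   # not exhaustive

    counters = {"read_message": 0, "data_message": 0, "coherence_message": 0}

    def classify(msg_type):
        # Mirrors logAddressRead / logAddressData / logAddressCoherence above.
        if msg_type in READ_TYPES:
            counters["read_message"] += 1
        elif msg_type in DATA_TYPES:
            counters["data_message"] += 1
        elif msg_type in COHERENCE_TYPES:
            counters["coherence_message"] += 1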
diff -r ea23ae7dc3ff -r f3b35cc4f72d src/mem/protocol/MOESI_hammer_bcu-msg.sm
--- a/src/mem/protocol/MOESI_hammer_bcu-msg.sm  Wed Dec 02 17:08:49 2015 -0600
+++ b/src/mem/protocol/MOESI_hammer_bcu-msg.sm  Wed Dec 02 17:08:49 2015 -0600
@@ -95,6 +95,7 @@
   NetDest OriginalDestination, desc="Multicast destination mask";
   MessageSizeType MessageSize, desc="size category of the message";
   bool DirectedProbe, default="false", desc="probe filter directed probe";
+  Address VAddr, desc="Virtual address for this request";
   Cycles InitialRequestTime, default="Cycles(0)",
          desc="time the initial requests was sent from the L1Cache";
@@ -125,6 +126,7 @@
   bool Dirty, desc="Is the data dirty (different than memory)?";
   int Acks, default="0", desc="How many messages this counts as";
   MessageSizeType MessageSize, desc="size category of the message";
+  Address VAddr, desc="Virtual address for this request";
   Cycles InitialRequestTime, default="Cycles(0)",
          desc="time the initial requests was sent from the L1Cache";
diff -r ea23ae7dc3ff -r f3b35cc4f72d src/mem/ruby/PermissionTable.cc
--- a/src/mem/ruby/PermissionTable.cc  Wed Dec 02 17:08:49 2015 -0600
+++ b/src/mem/ruby/PermissionTable.cc  Wed Dec 02 17:08:49 2015 -0600
@@ -55,6 +55,22 @@
     DPRINTF(PermissionTable, "Addr %#x, %s\n", addr.getAddress(), isWrite ? "W" : "R");
 }
 
+void PermissionTable::logAddressData(Address addr)
+{
+    m_data_message++;
+}
+
+
+void PermissionTable::logAddressRead(Address addr)
+{
+    m_read_message++;
+}
+
+void PermissionTable::logAddressCoherence(Address addr)
+{
+    m_coherence_message++;
+}
+
 /* After ATS translates address, insert it into permission table & cache.
  * Should only be called from ATS (shaderMMU)
  * Returns true if anything was inserted, since then a memory request
@@ -74,10 +90,8 @@
         //not in PLB - insert it
         hasWrite ? m_plb_write_insert_miss++ : m_plb_read_insert_miss++;
         if (plb_size > 0){
-            //update with this entry
-            entry.first = plb_tag;
-            //initialize vector
-            entry.second = std::vector<bool>(plb_entry_size * 2, false);
+            m_table_read++;
+            entry = getTableBlock(plb_tag);
 
             entry.second[offset] = true;
             entry.second[offset + 1] = hasWrite;
@@ -144,6 +158,20 @@
     return true;
 }
 
+/* Flush the PermissionTable and PLB */
+void PermissionTable::flush(){
+    m_plb_flushes++;
+
+    //Clear the table
+    std::fill(permission_bitmap.begin(), permission_bitmap.end(), false);
+
+    //Clear the PLB
+    m_plb_flushed_blocks += plb.size();
+    plb.clear();
+
+}
+
 /*Helper functions for dealing with PLB*/
 /* requires pre-shifted address (tag) */
 PermissionTable::plb_entry PermissionTable::popPLBEntry(Address plb_tag){
@@ -324,6 +352,31 @@
         .name(name() + ".table_write")
         .desc("Number of permission table writes")
         ;
+
+    m_data_message
+        .name(name() + ".data_message")
+        .desc("Number of messages carrying WB data")
+        ;
+
+    m_read_message
+        .name(name() + ".read_message")
+        .desc("Number of messages carrying requests for data")
+        ;
+
+    m_coherence_message
+        .name(name() + ".coherence_message")
+        .desc("Number of other coherence messages")
+        ;
+
+    m_plb_flushes
+        .name(name() + ".num_plb_flushes")
+        .desc("Number of PLB flushes")
+        ;
+
+    m_plb_flushed_blocks
+        .name(name() + ".num_plb_flushed_blocks")
+        .desc("Number of flushed PLB blocks")
+        ;
 }
 
 PermissionTable *
diff -r ea23ae7dc3ff -r f3b35cc4f72d src/mem/ruby/PermissionTable.hh
--- a/src/mem/ruby/PermissionTable.hh  Wed Dec 02 17:08:49 2015 -0600
+++ b/src/mem/ruby/PermissionTable.hh  Wed Dec 02 17:08:49 2015 -0600
@@ -48,11 +48,16 @@
     PermissionTable(const Params *p);
 
     void logAddress(Address addr, bool isWrite);
+    void logAddressData(Address addr);
+    void logAddressRead(Address addr);
+    void logAddressCoherence(Address addr);
 
     bool insert(Addr page_addr, bool hasWrite);
 
     bool checkPLB(Address addr, bool isWriteback);
 
+    void flush();
+
     bool checkTable(Address addr, bool isWriteback);
 
     // What 64-byte block is this address found in?
@@ -115,6 +120,13 @@
     Stats::Scalar m_table_read;
     Stats::Scalar m_table_write;
 
+    Stats::Scalar m_data_message;
+    Stats::Scalar m_read_message;
+    Stats::Scalar m_coherence_message;
+
+    Stats::Scalar m_plb_flushes;
+    Stats::Scalar m_plb_flushed_blocks;
+
 };
 
 #endif // __PERMISSION_TABLE_HH__
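PermissionTable::flush() above gives the TLB-shootdown path a single call that invalidates every cached permission, while the two new stats record how often that happens and how much PLB state it discards. The core of it as a self-contained sketch; field names follow PermissionTable.cc, and the Python containers stand in for the real bitmap and PLB structures:

    class PermissionTableSketch:
        def __init__(self, table_bits):
            self.permission_bitmap = [False] * table_bits  # backing table
            self.plb = {}                                  # tag -> permission bits
            self.num_plb_flushes = 0
            self.num_plb_flushed_blocks = 0

        def flush(self):
            # Mirrors PermissionTable::flush(): clear the table, drop the whole
            # PLB, and account for how much state was thrown away.
            self.num_plb_flushes += 1
            for i in range(len(self.permission_bitmap)):
                self.permission_bitmap[i] = False
            self.num_plb_flushed_blocks += len(self.plb)
            self.plb.clear()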