# HG changeset patch # User Lena Olson # Date 1449098773 21600 # Node ID 20dc45bc1490d609185c620cc5e8db2bce89b074 # Parent 3a87241adfb8c993f4ba2671ae6cc7082743ee71 Configs: Update checkpoint logic to fix taking and restoring checkpoints at the same time diff -r 3a87241adfb8 -r 20dc45bc1490 configs/common/Simulation.py --- a/configs/common/Simulation.py Sat Oct 11 16:18:51 2014 -0500 +++ b/configs/common/Simulation.py Wed Dec 02 17:26:13 2015 -0600 @@ -485,7 +485,8 @@ # option only for finding the checkpoints to restore from. This # lets us test checkpointing by restoring from one set of # checkpoints, generating a second set, and then comparing them. - if options.take_checkpoints and options.checkpoint_restore: + if (options.take_checkpoints or options.checkpoint_at_end) and \ + options.checkpoint_restore: if m5.options.outdir: cptdir = m5.options.outdir else: # HG changeset patch # User Lena Olson # Date 1449098817 21600 # Node ID 157e8df00aa5075936b75f412529b00aed4ee7dd # Parent 20dc45bc1490d609185c620cc5e8db2bce89b074 Mem: Require flush requests to have a response * * * Ruby: Fix flush response request leak The flush-response patch makes the FlushReq MemCmd require a response, which causes requests to not be deleted in the packet destructor during cooldown of the caches before checkpointing. Requests are 72B, so larger systems with reasonably large caches leak a lot of memory for each checkpoint taken. NOTE: This change will be folded into the jason/flush-responses patch. diff -r 20dc45bc1490 -r 157e8df00aa5 src/mem/packet.cc --- a/src/mem/packet.cc Wed Dec 02 17:26:13 2015 -0600 +++ b/src/mem/packet.cc Wed Dec 02 17:26:57 2015 -0600 @@ -166,7 +166,11 @@ /* PrintReq */ { SET2(IsRequest, IsPrint), InvalidCmd, "PrintReq" }, /* Flush Request */ - { SET3(IsRequest, IsFlush, NeedsExclusive), InvalidCmd, "FlushReq" }, + { SET4(IsRequest, IsFlush, NeedsExclusive, NeedsResponse), FlushResp, + "FlushReq" }, + /* Flush Response */ + { SET3(IsResponse, IsFlush, NeedsExclusive), InvalidCmd, + "FlushResp" }, /* Invalidation Request */ { SET3(NeedsExclusive, IsInvalidate, IsRequest), InvalidCmd, "InvalidationReq" }, diff -r 20dc45bc1490 -r 157e8df00aa5 src/mem/packet.hh --- a/src/mem/packet.hh Wed Dec 02 17:26:13 2015 -0600 +++ b/src/mem/packet.hh Wed Dec 02 17:26:57 2015 -0600 @@ -119,6 +119,7 @@ // Fake simulator-only commands PrintReq, // Print state matching address FlushReq, //request for a cache flush + FlushResp, InvalidationReq, // request for address to be invalidated from lsq NUM_MEM_CMDS }; diff -r 20dc45bc1490 -r 157e8df00aa5 src/mem/ruby/system/Sequencer.cc --- a/src/mem/ruby/system/Sequencer.cc Wed Dec 02 17:26:13 2015 -0600 +++ b/src/mem/ruby/system/Sequencer.cc Wed Dec 02 17:26:57 2015 -0600 @@ -568,6 +568,8 @@ delete pkt; g_system_ptr->m_cache_recorder->enqueueNextFetchRequest(); } else if (g_system_ptr->m_cooldown_enabled) { + assert(pkt->req); + delete pkt->req; delete pkt; g_system_ptr->m_cache_recorder->enqueueNextFlushRequest(); } else { # HG changeset patch # User Joel Hestness # Date 1449098817 21600 # Node ID a9b64c93ebd89aeea0a70ad7e02dd32827015cb4 # Parent 157e8df00aa5075936b75f412529b00aed4ee7dd Add the x86 magic instruction to do a callback into gem5 when a gpu call is made by the application that is being run * * * Fixed M5 magic instruction m5_gpu to be serializing * * * ARM ISA: Add m5_gpu magic instruction Submitter: Jieming Yin In order to add gem5-gpu ARM support, the ARM ISA needs to include the m5_gpu magic instruction. 
This patch adds that instruction including passing memory addresses as 64-bit rather than 32-bit as the underlying architecture. * * * m5ops: Add ARM32 m5_gpu interface function Submitter: Jieming Yin Add the m5 utils function to intercept m5_gpu calls under ARM32. This is a required step before building ARM32 CPU functionality for gem5-gpu. NOTE: other patches are required for the ARM decoder to know how to handle m5_gpu as a pseudo-instruction. diff -r 157e8df00aa5 -r a9b64c93ebd8 src/arch/arm/isa/formats/m5ops.isa --- a/src/arch/arm/isa/formats/m5ops.isa Wed Dec 02 17:26:57 2015 -0600 +++ b/src/arch/arm/isa/formats/m5ops.isa Wed Dec 02 17:26:57 2015 -0600 @@ -68,6 +68,7 @@ case 0x54: return new M5panic(machInst); case 0x5a: return new M5workbegin(machInst); case 0x5b: return new M5workend(machInst); + case 0x5c: return new M5gpu(machInst); } } ''' diff -r 157e8df00aa5 -r a9b64c93ebd8 src/arch/arm/isa/insts/m5ops.isa --- a/src/arch/arm/isa/insts/m5ops.isa Wed Dec 02 17:26:57 2015 -0600 +++ b/src/arch/arm/isa/insts/m5ops.isa Wed Dec 02 17:26:57 2015 -0600 @@ -563,4 +563,18 @@ header_output += BasicDeclare.subst(m5workendIop) decoder_output += BasicConstructor.subst(m5workendIop) exec_output += PredOpExecute.subst(m5workendIop) + + m5gpuCode = '''PseudoInst::gpu( + xc->tcBase(), + join32to64(R1, R0), + join32to64(R3, R2) + );''' + m5gpuIop = InstObjParams("m5gpu", "M5gpu", "PredOp", + { "code": m5gpuCode, + "predicate_test": predicateTest }, + ["IsNonSpeculative", "IsSerializeAfter"]) + header_output += BasicDeclare.subst(m5gpuIop) + decoder_output += BasicConstructor.subst(m5gpuIop) + exec_output += PredOpExecute.subst(m5gpuIop) + }}; diff -r 157e8df00aa5 -r a9b64c93ebd8 src/arch/x86/isa/decoder/two_byte_opcodes.isa --- a/src/arch/x86/isa/decoder/two_byte_opcodes.isa Wed Dec 02 17:26:57 2015 -0600 +++ b/src/arch/x86/isa/decoder/two_byte_opcodes.isa Wed Dec 02 17:26:57 2015 -0600 @@ -216,6 +216,9 @@ 0x5b: m5_work_end({{ PseudoInst::workend(xc->tcBase(), Rdi, Rsi); }}, IsNonSpeculative); + 0x5c: m5_gpu({{ + PseudoInst::gpu(xc->tcBase(), Rdi, Rsi); + }}, IsNonSpeculative, IsSerializeAfter); default: Inst::UD2(); } } diff -r 157e8df00aa5 -r a9b64c93ebd8 src/sim/pseudo_inst.cc --- a/src/sim/pseudo_inst.cc Wed Dec 02 17:26:57 2015 -0600 +++ b/src/sim/pseudo_inst.cc Wed Dec 02 17:26:57 2015 -0600 @@ -706,4 +706,10 @@ } } +void +gpu(ThreadContext *tc, uint64_t param1, uint64_t param2) +{ + panic("gpu pseudo instruction not yet defined"); +} + } // namespace PseudoInst diff -r 157e8df00aa5 -r a9b64c93ebd8 src/sim/pseudo_inst.hh --- a/src/sim/pseudo_inst.hh Wed Dec 02 17:26:57 2015 -0600 +++ b/src/sim/pseudo_inst.hh Wed Dec 02 17:26:57 2015 -0600 @@ -88,6 +88,7 @@ void switchcpu(ThreadContext *tc); void workbegin(ThreadContext *tc, uint64_t workid, uint64_t threadid); void workend(ThreadContext *tc, uint64_t workid, uint64_t threadid); +void gpu(ThreadContext *tc, uint64_t param1, uint64_t param2); } // namespace PseudoInst diff -r 157e8df00aa5 -r a9b64c93ebd8 util/m5/Makefile.x86 --- a/util/m5/Makefile.x86 Wed Dec 02 17:26:57 2015 -0600 +++ b/util/m5/Makefile.x86 Wed Dec 02 17:26:57 2015 -0600 @@ -32,6 +32,7 @@ LD=ld CFLAGS=-O2 -DM5OP_ADDR=0xFFFF0000 +LDFLAGS=-static OBJS=m5.o m5op_x86.o all: m5 @@ -43,7 +44,7 @@ $(CC) $(CFLAGS) -o $@ -c $< m5: $(OBJS) - $(CC) -o $@ $(OBJS) + $(CC) $(LDFLAGS) -o $@ $(OBJS) clean: rm -f *.o m5 diff -r 157e8df00aa5 -r a9b64c93ebd8 util/m5/m5.c --- a/util/m5/m5.c Wed Dec 02 17:26:57 2015 -0600 +++ b/util/m5/m5.c Wed Dec 02 17:26:57 2015 -0600 @@ -101,32 +101,32 @@ } 
} -int -write_file(const char *filename) -{ - fprintf(stderr, "opening %s\n", filename); - int src_fid = open(filename, O_RDONLY); - - if (src_fid < 0) { - fprintf(stderr, "error opening %s\n", filename); - return; - } - - char buf[256*1024]; - int offset = 0; - int len; - int bytes = 0; - - memset(buf, 0, sizeof(buf)); - - while ((len = read(src_fid, buf, sizeof(buf))) > 0) { - bytes += m5_writefile(buf, len, offset, filename); - offset += len; - } - fprintf(stderr, "written %d bytes\n", bytes); - - close(src_fid); -} +//int +//write_file(const char *filename) +//{ +// fprintf(stderr, "opening %s\n", filename); +// int src_fid = open(filename, O_RDONLY); +// +// if (src_fid < 0) { +// fprintf(stderr, "error opening %s\n", filename); +// return; +// } +// +// char buf[256*1024]; +// int offset = 0; +// int len; +// int bytes = 0; +// +// memset(buf, 0, sizeof(buf)); +// +// while ((len = read(src_fid, buf, sizeof(buf))) > 0) { +// bytes += m5_writefile(buf, len, offset, filename); +// offset += len; +// } +// fprintf(stderr, "written %d bytes\n", bytes); +// +// close(src_fid); +//} void do_exit(int argc, char *argv[]) @@ -183,16 +183,16 @@ read_file(STDOUT_FILENO); } -void -do_write_file(int argc, char *argv[]) -{ - if (argc != 1) - usage(); - - const char *filename = argv[0]; - - write_file(filename); -} +//void +//do_write_file(int argc, char *argv[]) +//{ +// if (argc != 1) +// usage(); +// +// const char *filename = argv[0]; +// +// write_file(filename); +//} void do_exec_file(int argc, char *argv[]) @@ -255,6 +255,16 @@ (param >> 12) & 0xfff, (param >> 0) & 0xfff); } +void +do_gpu(int argc, char *argv[]) +{ + if (argc != 0) + usage(); + + // @TODO: Figure out params that need to be passed + m5_gpu(); +} + #ifdef linux void do_pin(int argc, char *argv[]) @@ -294,12 +304,13 @@ { "dumpstats", do_dump_stats, "[delay [period]]" }, { "dumpresetstats", do_dump_reset_stats, "[delay [period]]" }, { "readfile", do_read_file, "" }, - { "writefile", do_write_file, "" }, +// { "writefile", do_write_file, "" }, { "execfile", do_exec_file, "" }, { "checkpoint", do_checkpoint, "[delay [period]]" }, { "loadsymbol", do_load_symbol, "
" }, { "initparam", do_initparam, "" }, { "sw99param", do_sw99param, "" }, + { "gpu", do_gpu, "" }, #ifdef linux { "pin", do_pin, " [args ...]" } #endif diff -r 157e8df00aa5 -r a9b64c93ebd8 util/m5/m5op.h --- a/util/m5/m5op.h Wed Dec 02 17:26:57 2015 -0600 +++ b/util/m5/m5op.h Wed Dec 02 17:26:57 2015 -0600 @@ -54,13 +54,14 @@ void m5_dump_stats(uint64_t ns_delay, uint64_t ns_period); void m5_dumpreset_stats(uint64_t ns_delay, uint64_t ns_period); uint64_t m5_readfile(void *buffer, uint64_t len, uint64_t offset); -uint64_t m5_writefile(void *buffer, uint64_t len, uint64_t offset, const char *filename); +//uint64_t m5_writefile(void *buffer, uint64_t len, uint64_t offset, const char *filename); void m5_debugbreak(void); void m5_switchcpu(void); void m5_addsymbol(uint64_t addr, char *symbol); void m5_panic(void); void m5_work_begin(uint64_t workid, uint64_t threadid); void m5_work_end(uint64_t workid, uint64_t threadid); +void m5_gpu(); // These operations are for critical path annotation void m5a_bsm(char *sm, const void *id, int flags); diff -r 157e8df00aa5 -r a9b64c93ebd8 util/m5/m5op_arm.S --- a/util/m5/m5op_arm.S Wed Dec 02 17:26:57 2015 -0600 +++ b/util/m5/m5op_arm.S Wed Dec 02 17:26:57 2015 -0600 @@ -89,6 +89,7 @@ SIMPLE_OP(m5_panic, panic_func, 0) SIMPLE_OP(m5_work_begin, work_begin_func, 0) SIMPLE_OP(m5_work_end, work_end_func, 0) +SIMPLE_OP(m5_gpu, gpu_func, 0) SIMPLE_OP(m5a_bsm, annotate_func, an_bsm) SIMPLE_OP(m5a_esm, annotate_func, an_esm) diff -r 157e8df00aa5 -r a9b64c93ebd8 util/m5/m5op_x86.S --- a/util/m5/m5op_x86.S Wed Dec 02 17:26:57 2015 -0600 +++ b/util/m5/m5op_x86.S Wed Dec 02 17:26:57 2015 -0600 @@ -83,3 +83,4 @@ TWO_BYTE_OP(m5_panic, panic_func) TWO_BYTE_OP(m5_work_begin, work_begin_func) TWO_BYTE_OP(m5_work_end, work_end_func) +TWO_BYTE_OP(m5_gpu, gpu_func) diff -r 157e8df00aa5 -r a9b64c93ebd8 util/m5/m5ops.h --- a/util/m5/m5ops.h Wed Dec 02 17:26:57 2015 -0600 +++ b/util/m5/m5ops.h Wed Dec 02 17:26:57 2015 -0600 @@ -61,6 +61,7 @@ #define work_begin_func 0x5a #define work_end_func 0x5b +#define gpu_func 0x5c // These operations are for critical path annotation #define annotate_func 0x55 # HG changeset patch # User Joel Hestness # Date 1449098817 21600 # Node ID a0e6ac8d1c02bd0ba50992af8e0d63797257ee26 # Parent a9b64c93ebd89aeea0a70ad7e02dd32827015cb4 Setup gem5 magic instructions to handle CUDA calls and build out the coordination with an updated version of libcuda * * * This patch should be folded into the full-system GPGPU-Sim functionality patch after appropriate implementation of sending GPU ST data into Ruby through write requests. * * * This patch should be folded into the full-system GPGPU-Sim functionality patch after appropriate implementation of requesting GPU LD data from Ruby through read requests. * * * Fold into the gpgpu-sim glue patch * * * Updating to apply patches to changeset 8929: Update configuration files to use the new option parser organization * * * Changes to the stream processor array to be folded into fs_functionality * * * Merge the instruction memory access in GPGPU-Sim into the gem5 memory hierarchy * * * Fold into merge_inst_memory patch * * * Fixup uninitialized values in SP array * * * Fix the issue of multiple buffered writes to the same data. 
Ordering is defined such that the last write received by the buffer is the one written to the line. * * * Add GPU syscall for registering device memory * * * GPU Magic Instruction: Modify header for pointer handling For the inclusion of ARM 32-bit, we will want to pass a single pointer type to gem5-gpu through the m5_gpu pseudo-instruction. To avoid separate handling for 32- vs. 64-bit architectures, use a uint64_t in the m5op header rather than the gpusyscall_t* pointer, which can have different sizes. NOTE: Using this updated function header requires the updated version of libcuda, though the change is backward compatible so that old binaries still execute correctly. diff -r a9b64c93ebd8 -r a0e6ac8d1c02 src/sim/pseudo_inst.cc --- a/src/sim/pseudo_inst.cc Wed Dec 02 17:26:57 2015 -0600 +++ b/src/sim/pseudo_inst.cc Wed Dec 02 17:26:57 2015 -0600 @@ -73,6 +73,12 @@ #include "sim/system.hh" #include "sim/vptr.hh" +//#include "../../gpgpu-sim/src/gem5/gpu_syscalls.hh" +struct gpusyscall; +typedef struct gpusyscall gpusyscall_t; +typedef uint64_t (*cudaFunc_t)(ThreadContext *, gpusyscall_t *); +extern cudaFunc_t gpgpu_funcs[]; + using namespace std; using namespace Stats; @@ -707,9 +713,14 @@ } void -gpu(ThreadContext *tc, uint64_t param1, uint64_t param2) +gpu(ThreadContext *tc, uint64_t gpusysno, uint64_t call_params) { - panic("gpu pseudo instruction not yet defined"); + if (gpusysno > 83) { + warn("Ignoring gpu syscall %d\n", gpusysno); + return; + } + + gpgpu_funcs[gpusysno](tc, (gpusyscall_t*)call_params); } } // namespace PseudoInst diff -r a9b64c93ebd8 -r a0e6ac8d1c02 src/sim/pseudo_inst.hh --- a/src/sim/pseudo_inst.hh Wed Dec 02 17:26:57 2015 -0600 +++ b/src/sim/pseudo_inst.hh Wed Dec 02 17:26:57 2015 -0600 @@ -88,7 +88,7 @@ void switchcpu(ThreadContext *tc); void workbegin(ThreadContext *tc, uint64_t workid, uint64_t threadid); void workend(ThreadContext *tc, uint64_t workid, uint64_t threadid); -void gpu(ThreadContext *tc, uint64_t param1, uint64_t param2); +void gpu(ThreadContext *tc, uint64_t gpusysno, uint64_t call_params); } // namespace PseudoInst diff -r a9b64c93ebd8 -r a0e6ac8d1c02 util/m5/m5.c --- a/util/m5/m5.c Wed Dec 02 17:26:57 2015 -0600 +++ b/util/m5/m5.c Wed Dec 02 17:26:57 2015 -0600 @@ -258,11 +258,12 @@ void do_gpu(int argc, char *argv[]) { - if (argc != 0) + if (argc < 1) usage(); - // @TODO: Figure out params that need to be passed - m5_gpu(); + uint64_t* callno = (uint64_t*)argv[0]; + + m5_gpu(*callno, NULL); } #ifdef linux diff -r a9b64c93ebd8 -r a0e6ac8d1c02 util/m5/m5op.h --- a/util/m5/m5op.h Wed Dec 02 17:26:57 2015 -0600 +++ b/util/m5/m5op.h Wed Dec 02 17:26:57 2015 -0600 @@ -61,7 +61,7 @@ void m5_panic(void); void m5_work_begin(uint64_t workid, uint64_t threadid); void m5_work_end(uint64_t workid, uint64_t threadid); -void m5_gpu(); +void m5_gpu(uint64_t __gpusysno, uint64_t call_params); // These operations are for critical path annotation void m5a_bsm(char *sm, const void *id, int flags); # HG changeset patch # User Joel Hestness # Date 1449098817 21600 # Node ID a3b87764d17ed733620870e4374f892b4212dfe3 # Parent a0e6ac8d1c02bd0ba50992af8e0d63797257ee26 Regressions: Add gem5 bits and pieces for gem5-gpu regressions Running gem5-style regressions on gem5-gpu requires updating a few parts of the gem5 regress infrastructure. Specifically, (1) add gem5-gpu as a test type in the tests SConscript, (2) add the appropriate GPU stats in diff-out to make sure we check them, and (3) add symlinks to the regression config scripts.
Notes: To run these tests requires two gem5-gpu patches that include the other scripts and initial set of regression tests. diff -r a0e6ac8d1c02 -r a3b87764d17e tests/SConscript --- a/tests/SConscript Wed Dec 02 17:26:57 2015 -0600 +++ b/tests/SConscript Wed Dec 02 17:26:57 2015 -0600 @@ -347,6 +347,9 @@ 'rubytest', 'memtest', 'memtest-filter', 'tgen-simple-mem', 'tgen-dram-ctrl'] +if env['GPGPU_SIM']: + configs.append('gem5-gpu') + if env['PROTOCOL'] != 'None': if env['PROTOCOL'] == 'MI_example': configs += [c + "-ruby" for c in configs] diff -r a0e6ac8d1c02 -r a3b87764d17e tests/configs/gem5-gpu-ruby.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/configs/gem5-gpu-ruby.py Wed Dec 02 17:26:57 2015 -0600 @@ -0,0 +1,1 @@ +../../../gem5-gpu/tests/configs/gem5-gpu-ruby.py \ No newline at end of file diff -r a0e6ac8d1c02 -r a3b87764d17e tests/diff-out --- a/tests/diff-out Wed Dec 02 17:26:57 2015 -0600 +++ b/tests/diff-out Wed Dec 02 17:26:57 2015 -0600 @@ -202,7 +202,8 @@ 'sim_ops', 'sim_ticks', 'host_inst_rate', - 'host_mem_usage' + 'host_mem_usage', + 'inst_counts' ); $key_stat_pattern = join('|', @key_stat_list); diff -r a0e6ac8d1c02 -r a3b87764d17e tests/quick/se_gpu --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/quick/se_gpu Wed Dec 02 17:26:57 2015 -0600 @@ -0,0 +1,1 @@ +../../../gem5-gpu/tests/quick/se_gpu/ \ No newline at end of file # HG changeset patch # User Joel Hestness # Date 1449098817 21600 # Node ID 8cf19b2fd1d8ab1d6ca517f86e30dbb312b35834 # Parent a3b87764d17ed733620870e4374f892b4212dfe3 DirectoryMemory: Fix the number of directory bits calculation diff -r a3b87764d17e -r 8cf19b2fd1d8 src/mem/ruby/structures/DirectoryMemory.cc --- a/src/mem/ruby/structures/DirectoryMemory.cc Wed Dec 02 17:26:57 2015 -0600 +++ b/src/mem/ruby/structures/DirectoryMemory.cc Wed Dec 02 17:26:57 2015 -0600 @@ -98,8 +98,8 @@ if (m_num_directories_bits == 0) return 0; - uint64 ret = address.bitSelect(m_numa_high_bit - m_num_directories_bits + 1, - m_numa_high_bit); + uint64 ret = address.shiftLowOrderBits(m_numa_high_bit - m_num_directories_bits + 1) % m_num_directories; + return ret; } # HG changeset patch # User Joel Hestness # Date 1449098818 21600 # Node ID cee4758c8ec8c8d8c7f5f9af3f6e199bbc6bf683 # Parent 8cf19b2fd1d8ab1d6ca517f86e30dbb312b35834 imported patch common/fix_memory_controller_timings diff -r 8cf19b2fd1d8 -r cee4758c8ec8 src/mem/ruby/structures/RubyMemoryControl.cc --- a/src/mem/ruby/structures/RubyMemoryControl.cc Wed Dec 02 17:26:57 2015 -0600 +++ b/src/mem/ruby/structures/RubyMemoryControl.cc Wed Dec 02 17:26:58 2015 -0600 @@ -547,7 +547,8 @@ bank, m_event.scheduled() ? 'Y':'N'); if (req->m_msgptr) { // don't enqueue L3 writebacks - enqueueToDirectory(req, Cycles(m_mem_ctl_latency + m_mem_fixed_delay)); + enqueueToDirectory(req, Cycles(m_mem_ctl_latency + m_mem_fixed_delay + + m_bank_busy_time + m_basic_bus_busy_time)); } m_oldRequest[bank] = 0; markTfaw(rank); # HG changeset patch # User Joel Hestness # Date 1449098818 21600 # Node ID 303eda09549df0943f83e70ddf55c197d2168f58 # Parent cee4758c8ec8c8d8c7f5f9af3f6e199bbc6bf683 Ruby Directory Memory: Update to map/index for GPU In order to have a split memory hierarchy for the CPU and GPU, the device directories (GPU directories) need to be aware of the mapping of memory across different device directories This patch does not affect the standard gem5 functionality of the directory controller. 
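For illustration only, a minimal sketch of how the device_directory flag added below to DirectoryMemory.py might be used from a Ruby protocol config. The directory counts, sizes, and numa_bit value are placeholder assumptions, not values taken from this patch series:

    from m5.objects import RubyDirectoryMemory

    num_cpu_dirs = 2   # placeholder counts for illustration
    num_gpu_dirs = 2
    numa_bit = 6       # placeholder; normally derived from block size and directory count

    # Host-memory directories keep the default device_directory = False.
    cpu_dirs = [RubyDirectoryMemory(version = i, size = '512MB',
                                    numa_high_bit = numa_bit)
                for i in range(num_cpu_dirs)]

    # GPU (device) directories are flagged so that addresses above the host
    # directories' total size are mapped into the device segment.
    gpu_dirs = [RubyDirectoryMemory(version = num_cpu_dirs + i, size = '128MB',
                                    numa_high_bit = numa_bit,
                                    device_directory = True)
                for i in range(num_gpu_dirs)]
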
diff -r cee4758c8ec8 -r 303eda09549d src/mem/ruby/SConscript --- a/src/mem/ruby/SConscript Wed Dec 02 17:26:58 2015 -0600 +++ b/src/mem/ruby/SConscript Wed Dec 02 17:26:58 2015 -0600 @@ -40,6 +40,7 @@ DebugFlag('ProtocolTrace') DebugFlag('RubyCache') DebugFlag('RubyCacheTrace') +DebugFlag('RubyDirectoryMemory') DebugFlag('RubyDma') DebugFlag('RubyGenerated') DebugFlag('RubyMemory') diff -r cee4758c8ec8 -r 303eda09549d src/mem/ruby/structures/DirectoryMemory.cc --- a/src/mem/ruby/structures/DirectoryMemory.cc Wed Dec 02 17:26:58 2015 -0600 +++ b/src/mem/ruby/structures/DirectoryMemory.cc Wed Dec 02 17:26:58 2015 -0600 @@ -28,6 +28,7 @@ #include "base/intmath.hh" #include "debug/RubyCache.hh" +#include "debug/RubyDirectoryMemory.hh" #include "debug/RubyStats.hh" #include "mem/ruby/slicc_interface/RubySlicc_Util.hh" #include "mem/ruby/structures/DirectoryMemory.hh" @@ -40,6 +41,10 @@ uint64_t DirectoryMemory::m_total_size_bytes = 0; int DirectoryMemory::m_numa_high_bit = 0; +int DirectoryMemory::m_num_dev_directories = 0; +uint64_t DirectoryMemory::m_device_segment_base = 0; +int DirectoryMemory::m_num_dev_directories_bits = 0; + DirectoryMemory::DirectoryMemory(const Params *p) : SimObject(p) { @@ -50,6 +55,7 @@ m_use_map = p->use_map; m_map_levels = p->map_levels; m_numa_high_bit = p->numa_high_bit; + m_device_directory = p->device_directory; } void @@ -67,8 +73,14 @@ m_ram = g_system_ptr->getMemoryVector(); } - m_num_directories++; - m_num_directories_bits = ceilLog2(m_num_directories); + if (m_device_directory) { + m_num_dev_directories++; + m_num_dev_directories_bits = ceilLog2(m_num_dev_directories); + } else { + m_num_directories++; + m_num_directories_bits = ceilLog2(m_num_directories); + m_device_segment_base += m_size_bytes; + } m_total_size_bytes += m_size_bytes; if (m_numa_high_bit == 0) { @@ -92,13 +104,25 @@ } } +#define DEV_DIR_BITS 8 + uint64 DirectoryMemory::mapAddressToDirectoryVersion(PhysAddress address) { - if (m_num_directories_bits == 0) - return 0; - - uint64 ret = address.shiftLowOrderBits(m_numa_high_bit - m_num_directories_bits + 1) % m_num_directories; + uint64 ret; + if (m_num_dev_directories > 0) { + Addr addr = address.getAddress(); + if (addr >= m_device_segment_base) { + PhysAddress relative_addr; + relative_addr.setAddress(addr - m_device_segment_base); + ret = relative_addr.shiftLowOrderBits(m_numa_high_bit - m_num_dev_directories_bits + 1) % m_num_dev_directories; + ret += m_num_directories; + } else { + ret = address.shiftLowOrderBits(m_numa_high_bit - m_num_directories_bits + 1) % m_num_directories; + } + } else { + ret = address.shiftLowOrderBits(m_numa_high_bit - m_num_directories_bits + 1) % m_num_directories; + } return ret; } @@ -114,14 +138,36 @@ DirectoryMemory::mapAddressToLocalIdx(PhysAddress address) { uint64 ret; - if (m_num_directories_bits > 0) { - ret = address.bitRemove(m_numa_high_bit - m_num_directories_bits + 1, - m_numa_high_bit); + if (m_num_dev_directories > 0) { + if (address.getAddress() >= m_device_segment_base) { + PhysAddress relative_address; + relative_address.setAddress(address.getAddress() - m_device_segment_base); + if (m_num_dev_directories_bits > 0) { + ret = relative_address.bitRemove(m_numa_high_bit - m_num_dev_directories_bits + 1, + m_numa_high_bit); + } else { + ret = relative_address.getAddress(); + } + } else { + if (m_num_directories_bits > 0) { + ret = address.bitRemove(m_numa_high_bit - m_num_directories_bits + 1, + m_numa_high_bit); + } else { + ret = address.getAddress(); + } + } } else { - ret = 
address.getAddress(); + if (m_num_directories_bits > 0) { + ret = address.bitRemove(m_numa_high_bit - m_num_directories_bits + 1, + m_numa_high_bit); + } else { + ret = address.getAddress(); + } } - return ret >> (RubySystem::getBlockSizeBits()); + ret >>= (RubySystem::getBlockSizeBits()); + DPRINTF(RubyDirectoryMemory, "%#x, %u\n", address.getAddress(), ret); + return ret; } AbstractEntry* diff -r cee4758c8ec8 -r 303eda09549d src/mem/ruby/structures/DirectoryMemory.hh --- a/src/mem/ruby/structures/DirectoryMemory.hh Wed Dec 02 17:26:58 2015 -0600 +++ b/src/mem/ruby/structures/DirectoryMemory.hh Wed Dec 02 17:26:58 2015 -0600 @@ -91,6 +91,11 @@ SparseMemory* m_sparseMemory; bool m_use_map; int m_map_levels; + + bool m_device_directory; + static int m_num_dev_directories; + static int m_num_dev_directories_bits; + static uint64_t m_device_segment_base; }; inline std::ostream& diff -r cee4758c8ec8 -r 303eda09549d src/mem/ruby/structures/DirectoryMemory.py --- a/src/mem/ruby/structures/DirectoryMemory.py Wed Dec 02 17:26:58 2015 -0600 +++ b/src/mem/ruby/structures/DirectoryMemory.py Wed Dec 02 17:26:58 2015 -0600 @@ -42,3 +42,4 @@ # the default value of the numa high bit is specified in the command line # option and must be passed into the directory memory sim object numa_high_bit = Param.Int("numa high bit") + device_directory = Param.Bool(False, "this directory is for a device") # HG changeset patch # User Lena Olson # Date 1449098818 21600 # Node ID c64a1d516f200310d184be2ee26ee490f3c5c126 # Parent 303eda09549df0943f83e70ddf55c197d2168f58 Ruby: Enable slicc to get data from RubyRequest This allows protocols to get the data out of the ruby request before the read/writeCallback function is called. This is useful when implementing write-through protocols where the data is needed before the transaction is complete. diff -r 303eda09549d -r c64a1d516f20 src/mem/protocol/RubySlicc_Types.sm --- a/src/mem/protocol/RubySlicc_Types.sm Wed Dec 02 17:26:58 2015 -0600 +++ b/src/mem/protocol/RubySlicc_Types.sm Wed Dec 02 17:26:58 2015 -0600 @@ -123,6 +123,7 @@ int Size, desc="size in bytes of access"; PrefetchBit Prefetch, desc="Is this a prefetch request"; int contextId, desc="this goes away but must be replace with Nilay"; + void writeData(DataBlock); } structure(AbstractEntry, primitive="yes", external = "yes") { diff -r 303eda09549d -r c64a1d516f20 src/mem/ruby/slicc_interface/RubyRequest.hh --- a/src/mem/ruby/slicc_interface/RubyRequest.hh Wed Dec 02 17:26:58 2015 -0600 +++ b/src/mem/ruby/slicc_interface/RubyRequest.hh Wed Dec 02 17:26:58 2015 -0600 @@ -36,6 +36,7 @@ #include "mem/protocol/RubyAccessMode.hh" #include "mem/protocol/RubyRequestType.hh" #include "mem/ruby/common/Address.hh" +#include "mem/ruby/common/DataBlock.hh" class RubyRequest : public Message { @@ -81,6 +82,12 @@ const int& getSize() const { return m_Size; } const PrefetchBit& getPrefetch() const { return m_Prefetch; } + void + writeData(DataBlock& block) const + { + block.setData(data, m_PhysicalAddress.getOffset(), m_Size); + } + void print(std::ostream& out) const; bool functionalRead(Packet *pkt); bool functionalWrite(Packet *pkt); # HG changeset patch # User Lena Olson # Date 1449098818 21600 # Node ID f3bb1f41536bce09c2200107e175e3a019e88086 # Parent c64a1d516f200310d184be2ee26ee490f3c5c126 Mem: Add and implement FlushAll command in Ruby. A FlushAll message requests the receiving controller to flush everything from its cache. This patch: - Adds a FlushAllReq/Resp MemCmd. 
- FLUSHALL RubyRequestType - Flash invalidate function to the Ruby cache controllers. The flash invalidate function invalidates all lines which are resident in the cache. There is a check to be sure that there are no lines in the busy or r/w state, although this may need to change in the future to support a more general flash invalidate. diff -r c64a1d516f20 -r f3bb1f41536b src/mem/packet.cc --- a/src/mem/packet.cc Wed Dec 02 17:26:58 2015 -0600 +++ b/src/mem/packet.cc Wed Dec 02 17:26:58 2015 -0600 @@ -174,6 +174,11 @@ /* Invalidation Request */ { SET3(NeedsExclusive, IsInvalidate, IsRequest), InvalidCmd, "InvalidationReq" }, + /* FlushAll Request */ + { SET4(IsRequest, NeedsResponse, IsFlush, IsInvalidate), FlushAllResp, + "FlushAllReq"}, + /* FlushAll Response */ + { SET3(IsResponse, IsFlush, IsInvalidate), InvalidCmd, "FlushAllResp"}, }; bool diff -r c64a1d516f20 -r f3bb1f41536b src/mem/packet.hh --- a/src/mem/packet.hh Wed Dec 02 17:26:58 2015 -0600 +++ b/src/mem/packet.hh Wed Dec 02 17:26:58 2015 -0600 @@ -121,6 +121,8 @@ FlushReq, //request for a cache flush FlushResp, InvalidationReq, // request for address to be invalidated from lsq + FlushAllReq, // Flush entire cache request + FlushAllResp, NUM_MEM_CMDS }; diff -r c64a1d516f20 -r f3bb1f41536b src/mem/protocol/RubySlicc_Exports.sm --- a/src/mem/protocol/RubySlicc_Exports.sm Wed Dec 02 17:26:58 2015 -0600 +++ b/src/mem/protocol/RubySlicc_Exports.sm Wed Dec 02 17:26:58 2015 -0600 @@ -135,6 +135,7 @@ COMMIT, desc="Commit version"; NULL, desc="Invalid request type"; FLUSH, desc="Flush request type"; + FLUSHALL, desc="Flush everything from the cache"; } enumeration(SequencerRequestType, desc="...", default="SequencerRequestType_NULL") { diff -r c64a1d516f20 -r f3bb1f41536b src/mem/protocol/RubySlicc_Types.sm --- a/src/mem/protocol/RubySlicc_Types.sm Wed Dec 02 17:26:58 2015 -0600 +++ b/src/mem/protocol/RubySlicc_Types.sm Wed Dec 02 17:26:58 2015 -0600 @@ -153,6 +153,7 @@ void setMRU(Address); void recordRequestType(CacheRequestType); bool checkResourceAvailable(CacheResourceType, Address); + void flashInvalidate(); Scalar demand_misses; Scalar demand_hits; diff -r c64a1d516f20 -r f3bb1f41536b src/mem/ruby/structures/CacheMemory.cc --- a/src/mem/ruby/structures/CacheMemory.cc Wed Dec 02 17:26:58 2015 -0600 +++ b/src/mem/ruby/structures/CacheMemory.cc Wed Dec 02 17:26:58 2015 -0600 @@ -323,6 +323,24 @@ } void +CacheMemory::flashInvalidate() +{ + // NOTE: It may make sense to invalidate Read_Write data but the assert + // is added for safety. 
+ for (int i = 0; i < m_cache_num_sets; i++) { + for (int j = 0; j < m_cache_assoc; j++) { + if (m_cache[i][j] == NULL) { + continue; + } + assert(m_cache[i][j]->m_Permission != AccessPermission_Busy); + assert(m_cache[i][j]->m_Permission != AccessPermission_Read_Write); + m_cache[i][j]->changePermission(AccessPermission_NotPresent); + } + } + m_tag_index.clear(); +} + +void CacheMemory::recordCacheContents(int cntrl, CacheRecorder* tr) const { uint64 warmedUpBlocks = 0; diff -r c64a1d516f20 -r f3bb1f41536b src/mem/ruby/structures/CacheMemory.hh --- a/src/mem/ruby/structures/CacheMemory.hh Wed Dec 02 17:26:58 2015 -0600 +++ b/src/mem/ruby/structures/CacheMemory.hh Wed Dec 02 17:26:58 2015 -0600 @@ -102,6 +102,8 @@ void clearLocked (const Address& addr); bool isLocked (const Address& addr, int context); + void flashInvalidate(); + // Print cache contents void print(std::ostream& out) const; void printData(std::ostream& out) const; diff -r c64a1d516f20 -r f3bb1f41536b src/mem/ruby/system/Sequencer.cc --- a/src/mem/ruby/system/Sequencer.cc Wed Dec 02 17:26:58 2015 -0600 +++ b/src/mem/ruby/system/Sequencer.cc Wed Dec 02 17:26:58 2015 -0600 @@ -235,7 +235,8 @@ (request_type == RubyRequestType_Store_Conditional) || (request_type == RubyRequestType_Locked_RMW_Read) || (request_type == RubyRequestType_Locked_RMW_Write) || - (request_type == RubyRequestType_FLUSH)) { + (request_type == RubyRequestType_FLUSH) || + (request_type == RubyRequestType_FLUSHALL)) { // Check if there is any outstanding read request for the same // cache line. @@ -445,7 +446,8 @@ (request->m_type == RubyRequestType_Store_Conditional) || (request->m_type == RubyRequestType_Locked_RMW_Read) || (request->m_type == RubyRequestType_Locked_RMW_Write) || - (request->m_type == RubyRequestType_FLUSH)); + (request->m_type == RubyRequestType_FLUSH) || + (request->m_type == RubyRequestType_FLUSHALL)); // // For Alpha, properly handle LL, SC, and write requests with respect to @@ -485,7 +487,8 @@ markRemoved(); assert((request->m_type == RubyRequestType_LD) || - (request->m_type == RubyRequestType_IFETCH)); + (request->m_type == RubyRequestType_IFETCH) || + (request->m_type == RubyRequestType_FLUSHALL)); hitCallback(request, data, true, mach, externalHit, initialRequestTime, forwardRequestTime, firstResponseTime); @@ -652,7 +655,11 @@ // primary_type = secondary_type = RubyRequestType_ST; } else if (pkt->isFlush()) { - primary_type = secondary_type = RubyRequestType_FLUSH; + if (pkt->cmd == MemCmd::FlushAllReq) { + primary_type = secondary_type = RubyRequestType_FLUSHALL; + } else { + primary_type = secondary_type = RubyRequestType_FLUSH; + } } else { panic("Unsupported ruby packet type\n"); } # HG changeset patch # User Joel Hestness # Date 1449098818 21600 # Node ID 01ee78aff9672274cc1fdd177b0034aad619fa37 # Parent f3bb1f41536bce09c2200107e175e3a019e88086 mem: Add a memory command for fences To communicate fence requests between the CudaCore and the ShaderLSQ, we need to add a packet MemCmd type. This patch will be used by follow-on patches to gem5-gpu. 
diff -r f3bb1f41536b -r 01ee78aff967 src/mem/packet.cc --- a/src/mem/packet.cc Wed Dec 02 17:26:58 2015 -0600 +++ b/src/mem/packet.cc Wed Dec 02 17:26:58 2015 -0600 @@ -179,6 +179,10 @@ "FlushAllReq"}, /* FlushAll Response */ { SET3(IsResponse, IsFlush, IsInvalidate), InvalidCmd, "FlushAllResp"}, + /* Fence Request */ + { SET2(IsRequest, NeedsResponse), FenceResp, "FenceReq"}, + /* Fence Response */ + { SET1(IsResponse), InvalidCmd, "FenceResp"}, }; bool diff -r f3bb1f41536b -r 01ee78aff967 src/mem/packet.hh --- a/src/mem/packet.hh Wed Dec 02 17:26:58 2015 -0600 +++ b/src/mem/packet.hh Wed Dec 02 17:26:58 2015 -0600 @@ -123,6 +123,8 @@ InvalidationReq, // request for address to be invalidated from lsq FlushAllReq, // Flush entire cache request FlushAllResp, + FenceReq, // Enforce memory access ordering based on pkt contents + FenceResp, // Fence operation has completed NUM_MEM_CMDS }; # HG changeset patch # User Joel Hestness # Date 1449098819 21600 # Node ID cc0e36279258b7abec4962aaf2f472a9aa9d1820 # Parent 01ee78aff9672274cc1fdd177b0034aad619fa37 Ruby Memory Controller: Remove refresh deadline With longer latency memory requests that come with more accurate modeling of the bus contention, the prior refresh deadlines no longer make sense. Add a warning when refresh time exceeds 500 cycles, but do not kill simulation. TODO: If accurate refresh modeling becomes desirable, this should be fixed to ensure refresh happens promptly. diff -r 01ee78aff967 -r cc0e36279258 src/mem/ruby/structures/RubyMemoryControl.cc --- a/src/mem/ruby/structures/RubyMemoryControl.cc Wed Dec 02 17:26:58 2015 -0600 +++ b/src/mem/ruby/structures/RubyMemoryControl.cc Wed Dec 02 17:26:59 2015 -0600 @@ -597,7 +597,7 @@ m_refresh_count = m_refresh_period_system; // Are we overrunning our ability to refresh? - assert(m_need_refresh < 10); + if (m_need_refresh >= 500) warn_once("Refresh delayed more than 500 cycles!\n"); m_need_refresh++; } # HG changeset patch # User Joel Hestness # Date 1449098819 21600 # Node ID b58aad0daaf4cdb1c09e262d9739fd094ca2fdc9 # Parent cc0e36279258b7abec4962aaf2f472a9aa9d1820 ruby: Generalize the Cluster network This patch adds two generalizations to the Cluster network: 1) In the case that components from one Cluster should be connected to components within another Cluster, we can add those components to both Clusters. This creates a problem if these two partially-connected Clusters are included as sub-Clusters of another Cluster, because the recursive definition causes the shared portions of the sub-Clusters to be traversed for each path entering the shared portions. Add a check to see if the sub-Cluster has already been instantiated, and if so, simply return to the super-Cluster that is calling the makeTopology function. 2) In the case that multiple sub-Clusters should be connected but the router within one should not be connected to the top-level Cluster (e.g. a tree hierarchy with shared components between subtrees), the Cluster would, by default, add a link between the sub-Cluster router and the top-level Cluster. Add a variable to specify whether the sub-Cluster's router should be connected to the top-level Cluster's router. Default the choice to True. 
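For illustration, a minimal sketch (not code from gem5-gpu itself) of how the two generalizations added below can combine; cpu_cntrls, gpu_cntrls, and dir_cntrls are assumed to be controller lists built elsewhere in the config:

    from topologies.Cluster import Cluster

    cpuCluster = Cluster(intBW = 32, extBW = 32)
    for c in cpu_cntrls:
        cpuCluster.add(c)

    gpuCluster = Cluster(intBW = 32, extBW = 32)
    for c in gpu_cntrls:
        gpuCluster.add(c)

    # Directory cluster shared by both subtrees: adding it to both sub-Clusters
    # is now safe because the "already instantiated" check makes the second
    # traversal return early, so it is built once but linked to both routers.
    dirCluster = Cluster(intBW = 32)
    for c in dir_cntrls:
        dirCluster.add(c)
    cpuCluster.add(dirCluster)
    gpuCluster.add(dirCluster)

    # The GPU side reaches the rest of the network only through dirCluster, so
    # suppress the default link from the top-level router to gpuCluster's router.
    gpuCluster.disableConnectToParent()

    mainCluster = Cluster(intBW = 32)
    mainCluster.add(cpuCluster)
    mainCluster.add(gpuCluster)

With this construction, dirCluster is instantiated during the cpuCluster traversal, reused rather than rebuilt during the gpuCluster traversal, and gpuCluster gets no direct link to mainCluster's router.
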
diff -r cc0e36279258 -r b58aad0daaf4 configs/topologies/Cluster.py --- a/configs/topologies/Cluster.py Wed Dec 02 17:26:59 2015 -0600 +++ b/configs/topologies/Cluster.py Wed Dec 02 17:26:59 2015 -0600 @@ -69,14 +69,29 @@ self.extBW = extBW self.intLatency = intLatency self.extLatency = extLatency + self.connectToParent = True def add(self, node): self.nodes.append(node) + # Since Clusters may be recursively defined, it may be desirable to nest + # Clusters without connecting them to higher-level parts of the network + # Use disableConnectToParent() to keep a Cluster from being connected + # to the router of a Cluster that contains it. + def getConnectToParent(self): + return self.connectToParent + + def disableConnectToParent(self): + self.connectToParent = False + def makeTopology(self, options, network, IntLink, ExtLink, Router): """ Recursively make all of the links and routers """ + # If this sub-Cluster has already been constructed + if self.router is not None: + return + # make a router to connect all of the nodes self.router = Router(router_id=self.num_routers()) network.routers.append(self.router) @@ -85,24 +100,25 @@ if type(node) == Cluster: node.makeTopology(options, network, IntLink, ExtLink, Router) - # connect this cluster to the router - link = IntLink(link_id=self.num_int_links(), node_a=self.router, - node_b=node.router) + if node.getConnectToParent(): + # connect this cluster to the router + link = IntLink(link_id=self.num_int_links(), + node_a=self.router, node_b=node.router) - if node.extBW: - link.bandwidth_factor = node.extBW + if node.extBW: + link.bandwidth_factor = node.extBW - # if there is an interanl b/w for this node - # and no ext b/w to override - elif self.intBW: - link.bandwidth_factor = self.intBW + # if there is an interanl b/w for this node + # and no ext b/w to override + elif self.intBW: + link.bandwidth_factor = self.intBW - if node.extLatency: - link.latency = node.extLatency - elif self.intLatency: - link.latency = self.intLatency + if node.extLatency: + link.latency = node.extLatency + elif self.intLatency: + link.latency = self.intLatency - network.int_links.append(link) + network.int_links.append(link) else: # node is just a controller, # connect it to the router via a ext_link # HG changeset patch # User Lena Olson # Date 1449098819 21600 # Node ID e4786d785afefce8fa4cb98de330271e5a4fd543 # Parent b58aad0daaf4cdb1c09e262d9739fd094ca2fdc9 Add flag to TLB to optionally bypass L1 Uses bypass option in Ruby to bypass the L1 cache for TLB accesses. This may be useful for systems with a small L1 cache that you do not want polluted by page walks. diff -r b58aad0daaf4 -r e4786d785afe src/arch/x86/X86TLB.py --- a/src/arch/x86/X86TLB.py Wed Dec 02 17:26:59 2015 -0600 +++ b/src/arch/x86/X86TLB.py Wed Dec 02 17:26:59 2015 -0600 @@ -49,6 +49,9 @@ system = Param.System(Parent.any, "system object") num_squash_per_cycle = Param.Unsigned(4, "Number of outstanding walks that can be squashed per cycle") + bypass_l1 = Param.Bool(False, "Bypass the L1 cache when issuing memory \ + accesses for pagetable walks. 
Useful for \ + caches that may hold stale data.") class X86TLB(BaseTLB): type = 'X86TLB' diff -r b58aad0daaf4 -r e4786d785afe src/arch/x86/pagetable_walker.cc --- a/src/arch/x86/pagetable_walker.cc Wed Dec 02 17:26:59 2015 -0600 +++ b/src/arch/x86/pagetable_walker.cc Wed Dec 02 17:26:59 2015 -0600 @@ -577,6 +577,9 @@ entry.vaddr = vaddr; Request::Flags flags = Request::PHYSICAL; + if (walker->bypassL1) { + flags.set(Request::BYPASS_L1); + } if (cr3.pcd) flags.set(Request::UNCACHEABLE); RequestPtr request = new Request(topAddr, dataSize, flags, diff -r b58aad0daaf4 -r e4786d785afe src/arch/x86/pagetable_walker.hh --- a/src/arch/x86/pagetable_walker.hh Wed Dec 02 17:26:59 2015 -0600 +++ b/src/arch/x86/pagetable_walker.hh Wed Dec 02 17:26:59 2015 -0600 @@ -180,6 +180,9 @@ // The number of outstanding walks that can be squashed per cycle. unsigned numSquashable; + // If true, send all memory requests with the bypass L1 flag true + bool bypassL1; + // Wrapper for checking for squashes before starting a translation. void startWalkWrapper(); @@ -207,7 +210,8 @@ MemObject(params), port(name() + ".port", this), funcState(this, NULL, NULL, true), tlb(NULL), sys(params->system), masterId(sys->getMasterId(name())), - numSquashable(params->num_squash_per_cycle) + numSquashable(params->num_squash_per_cycle), + bypassL1(params->bypass_l1) { } }; diff -r b58aad0daaf4 -r e4786d785afe src/mem/request.hh --- a/src/mem/request.hh Wed Dec 02 17:26:59 2015 -0600 +++ b/src/mem/request.hh Wed Dec 02 17:26:59 2015 -0600 @@ -140,6 +140,8 @@ static const FlagsType PF_EXCLUSIVE = 0x02000000; /** The request should be marked as LRU. */ static const FlagsType EVICT_NEXT = 0x04000000; + /** The request should bypass the L1 cache. */ + static const FlagsType BYPASS_L1 = 0x08000000; /** The request should be handled by the generic IPR code (only * valid together with MMAPPED_IPR) */ @@ -651,6 +653,7 @@ bool isClearLL() const { return _flags.isSet(CLEAR_LL); } bool isSecure() const { return _flags.isSet(SECURE); } bool isPTWalk() const { return _flags.isSet(PT_WALK); } + bool isBypassL1() const { return _flags.isSet(BYPASS_L1); } }; #endif // __MEM_REQUEST_HH__ # HG changeset patch # User Lena Olson # Date 1449098819 21600 # Node ID 10ea2c2a71b0890783a59a3b70b8e9365cd76c14 # Parent e4786d785afefce8fa4cb98de330271e5a4fd543 Ruby: Add request type to ruby that bypasses the L1 diff -r e4786d785afe -r 10ea2c2a71b0 src/mem/protocol/RubySlicc_Exports.sm --- a/src/mem/protocol/RubySlicc_Exports.sm Wed Dec 02 17:26:59 2015 -0600 +++ b/src/mem/protocol/RubySlicc_Exports.sm Wed Dec 02 17:26:59 2015 -0600 @@ -136,6 +136,8 @@ NULL, desc="Invalid request type"; FLUSH, desc="Flush request type"; FLUSHALL, desc="Flush everything from the cache"; + LD_Bypass, desc="Load, but bypass the L1"; + ST_Bypass, desc="Store, but bypass the L1"; } enumeration(SequencerRequestType, desc="...", default="SequencerRequestType_NULL") { diff -r e4786d785afe -r 10ea2c2a71b0 src/mem/ruby/system/Sequencer.cc --- a/src/mem/ruby/system/Sequencer.cc Wed Dec 02 17:26:59 2015 -0600 +++ b/src/mem/ruby/system/Sequencer.cc Wed Dec 02 17:26:59 2015 -0600 @@ -488,6 +488,7 @@ assert((request->m_type == RubyRequestType_LD) || (request->m_type == RubyRequestType_IFETCH) || + (request->m_type == RubyRequestType_LD_Bypass) || (request->m_type == RubyRequestType_FLUSHALL)); hitCallback(request, data, true, mach, externalHit, @@ -536,6 +537,7 @@ request_address.getOffset(), pkt->getSize()); } else if (pkt->getPtr(true) != NULL) { if ((type == RubyRequestType_LD) || + (type 
== RubyRequestType_LD_Bypass) || (type == RubyRequestType_IFETCH) || (type == RubyRequestType_RMW_Read) || (type == RubyRequestType_Locked_RMW_Read) || @@ -646,14 +648,22 @@ primary_type = RubyRequestType_RMW_Read; secondary_type = RubyRequestType_ST; } else { - primary_type = secondary_type = RubyRequestType_LD; + if (pkt->req->isBypassL1()) { + primary_type = secondary_type = RubyRequestType_LD_Bypass; + } else { + primary_type = secondary_type = RubyRequestType_LD; + } } } } else if (pkt->isWrite()) { // // Note: M5 packets do not differentiate ST from RMW_Write // - primary_type = secondary_type = RubyRequestType_ST; + if (pkt->req->isBypassL1()) { + primary_type = secondary_type = RubyRequestType_ST_Bypass; + } else { + primary_type = secondary_type = RubyRequestType_ST; + } } else if (pkt->isFlush()) { if (pkt->cmd == MemCmd::FlushAllReq) { primary_type = secondary_type = RubyRequestType_FLUSHALL; # HG changeset patch # User Lena Olson # Date 1449098820 21600 # Node ID a2d8a04da505d79092627e164893413f99327750 # Parent 10ea2c2a71b0890783a59a3b70b8e9365cd76c14 Adds architectural changes required to handle GPU page faults. This patch makes changes: 1) Adds a GPU page fault register to the CPU core. This is not architecturally visible. This register holds the state of the GPU page fault. Either 0 => Not handling a GPU page fault, or 1 => currently handling a GPU page fault. This register is set by the GPU device MMU before raising a page fault interrupt. 2) Modifies the iret instruction's microcode. Now, when returning from an interrupt, check the GPU page fault register. If the register is 1, then notify the GPU MMU that it's possible a GPU page fault has completed. 3) Adds a gpufaultfinish psuedo-instruction and the microcode implementation. This instruction calls a function on the GPU, gpuFinishPageFault. This function is implemented in gem5-gpu. diff -r 10ea2c2a71b0 -r a2d8a04da505 src/arch/x86/isa/insts/general_purpose/control_transfer/interrupts_and_exceptions.py --- a/src/arch/x86/isa/insts/general_purpose/control_transfer/interrupts_and_exceptions.py Wed Dec 02 17:26:59 2015 -0600 +++ b/src/arch/x86/isa/insts/general_purpose/control_transfer/interrupts_and_exceptions.py Wed Dec 02 17:27:00 2015 -0600 @@ -62,7 +62,13 @@ # Read the handy m5 register for use later rdm5reg t4 + # check if this was a GPU fault and notify the GPU. + rdval t5, "InstRegIndex(MISCREG_GPU_FAULT)" + andi t0, t5, 1, flags=(EZF,) + br label("notGPUFaultFallThrough"), flags=(CEZF,) + gpufaultfinish +notGPUFaultFallThrough: ### ### Handle if we're returning to virtual 8086 mode. ### diff -r 10ea2c2a71b0 -r a2d8a04da505 src/arch/x86/isa/microops/gpu.isa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/arch/x86/isa/microops/gpu.isa Wed Dec 02 17:27:00 2015 -0600 @@ -0,0 +1,91 @@ +// Copyright (c) 2013 Mark D. Hill and David A. Wood +// All rights reserved. +// +// The license below extends only to copyright in the software and shall +// not be construed as granting a license to any other intellectual +// property including but not limited to intellectual property relating +// to a hardware implementation of the functionality of the software +// licensed hereunder. You may use the software subject to the license +// terms below provided that you ensure that this notice is replicated +// unmodified and in its entirety in all distributions of the software, +// modified or unmodified, in source code or in binary form. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer; +// redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution; +// neither the name of the copyright holders nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Authors: Jason Power + +output header {{ + void gpuFinishPageFault(int gpuId, ThreadContext *tc); + class GPUFaultFinish : public X86ISA::X86MicroopBase + { + public: + GPUFaultFinish(ExtMachInst _machInst, const char * instMnem, + uint64_t setFlags) : + X86MicroopBase(_machInst, "gpufaultfinish", instMnem, + setFlags | (ULL(1) << StaticInst::IsNonSpeculative), + No_OpClass) + { + } + + %(BasicExecDeclare)s + + std::string generateDisassembly(Addr pc, + const SymbolTable *symtab) const; + }; +}}; + +output exec {{ + Fault + GPUFaultFinish::execute(CPU_EXEC_CONTEXT *xc, + Trace::InstRecord * traceData) const + { + gpuFinishPageFault(0, xc->tcBase()); + return NoFault; + } +}}; + +output decoder {{ + std::string GPUFaultFinish::generateDisassembly(Addr pc, + const SymbolTable *symtab) const + { + std::stringstream response; + + printMnemonic(response, instMnem, mnemonic); + + return response.str(); + } +}}; + +let {{ + class GPUFaultFinish(X86Microop): + className = "GPUFaultFinish" + def __init__(self): + pass + + def getAllocator(self, microFlags): + return "new GPUFaultFinish(machInst, macrocodeBlock, %s)" % \ + self.microFlagsText(microFlags) + + microopClasses["gpufaultfinish"] = GPUFaultFinish +}}; diff -r 10ea2c2a71b0 -r a2d8a04da505 src/arch/x86/isa/microops/microops.isa --- a/src/arch/x86/isa/microops/microops.isa Wed Dec 02 17:26:59 2015 -0600 +++ b/src/arch/x86/isa/microops/microops.isa Wed Dec 02 17:27:00 2015 -0600 @@ -61,3 +61,6 @@ //Microops for printing out debug messages through M5 ##include "debug.isa" + +//Microops for interacting with the GPU +##include "gpu.isa" diff -r 10ea2c2a71b0 -r a2d8a04da505 src/arch/x86/regs/misc.hh --- a/src/arch/x86/regs/misc.hh Wed Dec 02 17:26:59 2015 -0600 +++ b/src/arch/x86/regs/misc.hh Wed Dec 02 17:27:00 2015 -0600 @@ -396,6 +396,9 @@ // "Fake" MSRs for internally implemented devices MISCREG_PCI_CONFIG_ADDRESS, + // GPU fault register + MISCREG_GPU_FAULT, + NUM_MISCREGS }; @@ -937,6 +940,14 @@ Bitfield<11> enable; Bitfield<8> bsp; 
EndBitUnion(LocalApicBase) + + /** + * Register for active GPU page fault + * May need to increase to more bits if more than 1 GPU is in the system + */ + BitUnion64(GPUFaultReg) + Bitfield<0> inFault; + EndBitUnion(GPUFaultReg) } #endif // __ARCH_X86_INTREGS_HH__ # HG changeset patch # User Lena Olson # Date 1449098820 21600 # Node ID 6fcd842605cef42c25a7e02ef3acf8a50735eae4 # Parent a2d8a04da505d79092627e164893413f99327750 Adds an interrupt to x86 that corresponds to a GPU page fault. diff -r a2d8a04da505 -r 6fcd842605ce src/arch/x86/interrupts.cc --- a/src/arch/x86/interrupts.cc Wed Dec 02 17:27:00 2015 -0600 +++ b/src/arch/x86/interrupts.cc Wed Dec 02 17:27:00 2015 -0600 @@ -269,7 +269,9 @@ } else if (!DeliveryMode::isReserved(deliveryMode)) { DPRINTF(LocalApic, "Interrupt is an %s.\n", DeliveryMode::names[deliveryMode]); - if (deliveryMode == DeliveryMode::SMI && !pendingSmi) { + if (deliveryMode == DeliveryMode::GPUFault) { + pendingUnmaskableInt = pendingGpu = true; + } else if (deliveryMode == DeliveryMode::SMI && !pendingSmi) { pendingUnmaskableInt = pendingSmi = true; smiVector = vector; } else if (deliveryMode == DeliveryMode::NMI && !pendingNmi) { @@ -609,7 +611,7 @@ X86ISA::Interrupts::Interrupts(Params * p) : BasicPioDevice(p, PageBytes), IntDevice(this, p->int_latency), - apicTimerEvent(this), + apicTimerEvent(this), pendingGpu(false), pendingSmi(false), smiVector(0), pendingNmi(false), nmiVector(0), pendingExtInt(false), extIntVector(0), @@ -664,7 +666,13 @@ // These are all probably fairly uncommon, so we'll make them easier to // check for. if (pendingUnmaskableInt) { - if (pendingSmi) { + if (pendingGpu) { + DPRINTF(LocalApic, "Generated GPU page fault object.\n"); + Addr addr = tc->readMiscRegNoEffect(MISCREG_GPU_FAULTADDR); + uint32_t code = tc->readMiscRegNoEffect(MISCREG_GPU_FAULTCODE); + assert(((GPUFaultReg)tc->readMiscRegNoEffect(MISCREG_GPU_FAULT)).inFault == 1); + return new PageFault(addr, code); + } else if (pendingSmi) { DPRINTF(LocalApic, "Generated SMI fault object.\n"); return new SystemManagementInterrupt(); } else if (pendingNmi) { diff -r a2d8a04da505 -r 6fcd842605ce src/arch/x86/interrupts.hh --- a/src/arch/x86/interrupts.hh Wed Dec 02 17:27:00 2015 -0600 +++ b/src/arch/x86/interrupts.hh Wed Dec 02 17:27:00 2015 -0600 @@ -117,6 +117,7 @@ * A set of variables to keep track of interrupts that don't go through * the IRR. 
*/ + bool pendingGpu; bool pendingSmi; uint8_t smiVector; bool pendingNmi; @@ -231,6 +232,12 @@ return entry.periodic; } + void + triggerGPUInterrupt() + { + requestInterrupt(0, DeliveryMode::GPUFault, false); + } + AddrRangeList getIntAddrRange() const; BaseMasterPort &getMasterPort(const std::string &if_name, diff -r a2d8a04da505 -r 6fcd842605ce src/arch/x86/intmessage.hh --- a/src/arch/x86/intmessage.hh Wed Dec 02 17:27:00 2015 -0600 +++ b/src/arch/x86/intmessage.hh Wed Dec 02 17:27:00 2015 -0600 @@ -59,12 +59,13 @@ INIT = 5, SIPI = 6, ExtInt = 7, + GPUFault = 8, NumModes }; static const char * const names[NumModes] = { "Fixed", "LowestPriority", "SMI", "Reserved", - "NMI", "INIT", "Startup", "ExtInt" + "NMI", "INIT", "Startup", "ExtInt", "GPUFault" }; static inline bool diff -r a2d8a04da505 -r 6fcd842605ce src/arch/x86/regs/misc.hh --- a/src/arch/x86/regs/misc.hh Wed Dec 02 17:27:00 2015 -0600 +++ b/src/arch/x86/regs/misc.hh Wed Dec 02 17:27:00 2015 -0600 @@ -398,6 +398,8 @@ // GPU fault register MISCREG_GPU_FAULT, + MISCREG_GPU_FAULTADDR, + MISCREG_GPU_FAULTCODE, NUM_MISCREGS }; @@ -948,6 +950,14 @@ BitUnion64(GPUFaultReg) Bitfield<0> inFault; EndBitUnion(GPUFaultReg) + + BitUnion64(GPUFaultCode) + Bitfield<0> present; + Bitfield<1> write; + Bitfield<2> user; + Bitfield<3> reserved; + Bitfield<4> fetch; + EndBitUnion(GPUFaultCode) } #endif // __ARCH_X86_INTREGS_HH__ # HG changeset patch # User Joel Hestness # Date 1449098820 21600 # Node ID 066c06231f9f1bfcae610963c1e6f3126e52407f # Parent 6fcd842605cef42c25a7e02ef3acf8a50735eae4 ARM: Add GPU fault registers These are required to get gem5-gpu to compile and still have mostly ISA-agnostic code in the ShaderMMU. diff -r 6fcd842605ce -r 066c06231f9f src/arch/arm/miscregs.hh --- a/src/arch/arm/miscregs.hh Wed Dec 02 17:27:00 2015 -0600 +++ b/src/arch/arm/miscregs.hh Wed Dec 02 17:27:00 2015 -0600 @@ -672,7 +672,12 @@ MISCREG_A64_UNIMPL, // 603 MISCREG_UNKNOWN, // 604 - NUM_MISCREGS // 605 + // GPU fault register + MISCREG_GPU_FAULT, // 605 + MISCREG_GPU_FAULTADDR, // 606 + MISCREG_GPU_FAULTCODE, // 607 + + NUM_MISCREGS // 608 }; enum MiscRegInfo { @@ -1349,7 +1354,12 @@ "cp14_unimpl", "cp15_unimpl", "a64_unimpl", - "unknown" + "unknown", + + // GPU fault registers + "gpuf", + "gpufaddr", + "gpufcode" }; static_assert(sizeof(miscRegName) / sizeof(*miscRegName) == NUM_MISCREGS, @@ -1839,6 +1849,21 @@ Bitfield<9, 0> res1_9_0_el2; EndBitUnion(CPTR) + /** + * Register for active GPU page fault + * May need to increase to more bits if more than 1 GPU is in the system + */ + BitUnion64(GPUFaultReg) + Bitfield<0> inFault; + EndBitUnion(GPUFaultReg) + + BitUnion64(GPUFaultCode) + Bitfield<0> present; + Bitfield<1> write; + Bitfield<2> user; + Bitfield<3> reserved; + Bitfield<4> fetch; + EndBitUnion(GPUFaultCode) // Checks read access permissions to coproc. registers bool canReadCoprocReg(MiscRegIndex reg, SCR scr, CPSR cpsr, # HG changeset patch # User Joel Hestness # Date 1449098820 21600 # Node ID 79754ba05c2788ea3ec42dcee3c2d2c96ab215b1 # Parent 066c06231f9f1bfcae610963c1e6f3126e52407f ruby: Parameterize connect to IO bus Changes in gem5 (rev. 10116) now automatically connect Ruby sequencers to the IO bus, though not all sequencers are associated with IO-capable controllers (e.g. GPU, copy engine and page walk cache). Add a parameter to sequencers to disable connection to the IO bus. This param must be set to False by sequencers instantiated in gem5-gpu protocol config files. 
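For example, a hypothetical protocol-config fragment (the version and cache variables are placeholders) creating a GPU-side sequencer that stays off the piobus via the connect_to_io parameter added below:

    from m5.objects import RubySequencer

    gpu_seq = RubySequencer(version = seq_version,      # placeholder version number
                            icache = gpu_l1_cache,      # placeholder RubyCache objects
                            dcache = gpu_l1_cache,
                            ruby_system = ruby_system,
                            connect_to_io = False)      # Ruby.py then skips the piobus hookup
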
Note: This is a stop-gap fix until further decisions about GPU, copy engine IO capabilities are made. diff -r 066c06231f9f -r 79754ba05c27 configs/ruby/Ruby.py --- a/configs/ruby/Ruby.py Wed Dec 02 17:27:00 2015 -0600 +++ b/configs/ruby/Ruby.py Wed Dec 02 17:27:00 2015 -0600 @@ -199,11 +199,15 @@ # Connect the cpu sequencers and the piobus if piobus != None: for cpu_seq in cpu_sequencers: - cpu_seq.pio_master_port = piobus.slave - cpu_seq.mem_master_port = piobus.slave + # gem5-gpu: This is parameterized to not connect components that + # are unable to handle IO messages. This is a stop-gap fix until + # further decisions about GPU, copy engine IO capabilities are made + if cpu_seq.connect_to_io: + cpu_seq.pio_master_port = piobus.slave + cpu_seq.mem_master_port = piobus.slave - if buildEnv['TARGET_ISA'] == "x86": - cpu_seq.pio_slave_port = piobus.master + if buildEnv['TARGET_ISA'] == "x86": + cpu_seq.pio_slave_port = piobus.master ruby._cpu_ports = cpu_sequencers ruby.num_of_sequencers = len(cpu_sequencers) diff -r 066c06231f9f -r 79754ba05c27 src/mem/ruby/system/Sequencer.py --- a/src/mem/ruby/system/Sequencer.py Wed Dec 02 17:27:00 2015 -0600 +++ b/src/mem/ruby/system/Sequencer.py Wed Dec 02 17:27:00 2015 -0600 @@ -70,6 +70,10 @@ deadlock_threshold = Param.Cycles(500000, "max outstanding cycles for a request before deadlock/livelock declared") using_network_tester = Param.Bool(False, "") + # gem5-gpu: This is parameterized to not connect components that + # are unable to handle IO messages. This is a stop-gap fix until + # further decisions about GPU, copy engine IO capabilities are made + connect_to_io = Param.Bool(True, "Whether to connect to IO") class DMASequencer(RubyPort): type = 'DMASequencer' # HG changeset patch # User Joel Hestness # Date 1449098820 21600 # Node ID 7ff5f0cec9babca95ba23f9bfc1103bc80c467b0 # Parent 79754ba05c2788ea3ec42dcee3c2d2c96ab215b1 HACKY! Allow Ruby to Proceed on Func Access Fail When accessing memory functionally, if packets should access the canonical physical memory anyway (access_phys_mem), there is no need to fail when the cache state is incorrect, because the backing store will be accessed AND hold correct data. Put in a check to dodge this for now. NOTE! THIS PATCH SHOULD BE USED WITH CAUTION: This patch has been tested and shows seemingly correct execution for all benchmarks, but this is still considered a hack, because it side-steps appropriate handling of data in Ruby caches. This side-stepping may result in incorrect benchmark output and mask performance issues caused by incorrectly designed cache behavior. diff -r 79754ba05c27 -r 7ff5f0cec9ba src/mem/ruby/system/RubyPort.cc --- a/src/mem/ruby/system/RubyPort.cc Wed Dec 02 17:27:00 2015 -0600 +++ b/src/mem/ruby/system/RubyPort.cc Wed Dec 02 17:27:00 2015 -0600 @@ -308,7 +308,7 @@ // Unless the requester explicitly said otherwise, generate an error if // the functional request failed - if (!accessSucceeded && !pkt->suppressFuncError()) { + if (!accessSucceeded && !pkt->suppressFuncError() && !access_phys_mem) { fatal("Ruby functional %s failed for address %#x\n", pkt->isWrite() ?
"write" : "read", pkt->getAddr()); } # HG changeset patch # User Lena Olson # Date 1449098820 21600 # Node ID a56e9b3ca22f2a1573df46bc5ad56fbf728b966a # Parent 7ff5f0cec9babca95ba23f9bfc1103bc80c467b0 This patch is used to separate general patches for gem5-gpu and personal patches. It is needed so that when new patches are created at the end of the patch queue they are inserted well after any general gem5 patches. This greatly eases merging with changes to gem5-patches. To add a non-personal patch at the end of the patch queue, pop this patch and use qnew. (NOTE: This patch makes no changes to gem5) # HG changeset patch # User Lena Olson # Date 1449098820 21600 # Node ID bb8f525b8704bd6f6dc8d34eba8763110ddad44c # Parent a56e9b3ca22f2a1573df46bc5ad56fbf728b966a imported patch full-system-disk-pointer diff -r a56e9b3ca22f -r bb8f525b8704 configs/common/FSConfig.py --- a/configs/common/FSConfig.py Wed Dec 02 17:27:00 2015 -0600 +++ b/configs/common/FSConfig.py Wed Dec 02 17:27:00 2015 -0600 @@ -431,7 +431,8 @@ disk0 = CowIdeDisk(driveID='master') disk2 = CowIdeDisk(driveID='master') disk0.childImage(mdesc.disk()) - disk2.childImage(disk('linux-bigswap2.img')) + #disk2.childImage(disk('linux-bigswap2.img')) + disk2.childImage(disk('working.img')) self.pc.south_bridge.ide.disks = [disk0, disk2] # Add in a Bios information structure. # HG changeset patch # User Lena Olson # Date 1449098821 21600 # Node ID e833d880e04b389502e00199db2463e234c0dcd0 # Parent bb8f525b8704bd6f6dc8d34eba8763110ddad44c imported patch gdb-flags diff -r bb8f525b8704 -r e833d880e04b SConstruct --- a/SConstruct Wed Dec 02 17:27:00 2015 -0600 +++ b/SConstruct Wed Dec 02 17:27:01 2015 -0600 @@ -539,6 +539,7 @@ # As gcc and clang share many flags, do the common parts here main.Append(CCFLAGS=['-pipe']) main.Append(CCFLAGS=['-fno-strict-aliasing']) + main.Append(CCFLAGS=['-gdwarf-3']) # Enable -Wall and then disable the few warnings that we # consistently violate main.Append(CCFLAGS=['-Wall', '-Wno-sign-compare', '-Wundef']) # HG changeset patch # User Lena Olson # Date 1449098821 21600 # Node ID 2db91b90e03933b343bfab3341c103848f34346c # Parent e833d880e04b389502e00199db2463e234c0dcd0 Add a better error when there are duplicate virtual networks in an .sm file diff -r e833d880e04b -r 2db91b90e039 src/mem/slicc/symbols/StateMachine.py --- a/src/mem/slicc/symbols/StateMachine.py Wed Dec 02 17:27:01 2015 -0600 +++ b/src/mem/slicc/symbols/StateMachine.py Wed Dec 02 17:27:01 2015 -0600 @@ -567,7 +567,9 @@ vnet = var["virtual_network"] vnet_type = var["vnet_type"] - assert (vnet, network) not in vnet_dir_set + if (vnet, network) in vnet_dir_set: + #print vnet_dir_set + self.error("Duplicate entry for vnet(%s) and network(%s)" % (str(vnet), str(network))) vnet_dir_set.add((vnet,network)) code(''' # HG changeset patch # User Lena Olson # Date 1449098821 21600 # Node ID d63c91ba5966b4c10652fa9d6eaa2e5933d6d86d # Parent 2db91b90e03933b343bfab3341c103848f34346c Change error to warning if the C++ to python class mapping doesn't exist diff -r 2db91b90e039 -r d63c91ba5966 src/mem/slicc/symbols/StateMachine.py --- a/src/mem/slicc/symbols/StateMachine.py Wed Dec 02 17:27:01 2015 -0600 +++ b/src/mem/slicc/symbols/StateMachine.py Wed Dec 02 17:27:01 2015 -0600 @@ -220,9 +220,12 @@ code('${{param.ident}} = Param.${{python_type}}(${dflt_str}"")') else: - self.error("Unknown c++ to python class conversion for c++ " \ "type: '%s'.
Please update the python_class_map " \ - "in StateMachine.py", param.type_ast.type.c_ident) + self.warning("Unknown c++ to python class conversion for c++ " \ + "type: '%s'. Please update the python_class_map " \ + "in StateMachine.py\nAssuming same name for python "\ + "class", param.type_ast.type.c_ident) + python_type = param.type_ast.type.c_ident + code('${{param.ident}} = Param.${{python_type}}(${dflt_str}"")') code.dedent() code.write(path, '%s.py' % py_ident) # HG changeset patch # User Lena Olson # Date 1449098821 21600 # Node ID 755143994611e3d42599f4b152e76a76ac8090b7 # Parent d63c91ba5966b4c10652fa9d6eaa2e5933d6d86d imported patch lena diff -r d63c91ba5966 -r 755143994611 configs/common/Benchmarks.py --- a/configs/common/Benchmarks.py Wed Dec 02 17:27:01 2015 -0600 +++ b/configs/common/Benchmarks.py Wed Dec 02 17:27:01 2015 -0600 @@ -54,7 +54,7 @@ elif buildEnv['TARGET_ISA'] == 'alpha': return env.get('LINUX_IMAGE', disk('linux-latest.img')) elif buildEnv['TARGET_ISA'] == 'x86': - return env.get('LINUX_IMAGE', disk('x86root.img')) + return env.get('LINUX_IMAGE', disk('linux-x86.img')) elif buildEnv['TARGET_ISA'] == 'arm': return env.get('LINUX_IMAGE', disk('linux-arm-ael.img')) else: diff -r d63c91ba5966 -r 755143994611 configs/common/SysPaths.py --- a/configs/common/SysPaths.py Wed Dec 02 17:27:01 2015 -0600 +++ b/configs/common/SysPaths.py Wed Dec 02 17:27:01 2015 -0600 @@ -50,7 +50,7 @@ try: path = env['M5_PATH'].split(':') except KeyError: - path = [ '/dist/m5/system', '/n/poolfs/z/dist/m5/system' ] + path = [ '/p/multifacet/users/lena/gem5-gpu/' ] for system.dir in path: if os.path.isdir(system.dir): # HG changeset patch # User Lena Olson # Date 1449098822 21600 # Node ID 881ecce573ae277a5ef58ad475a36b928138d6d5 # Parent 755143994611e3d42599f4b152e76a76ac8090b7 Ruby: Add parameter to sequencer to control if it can restore from checkpoints diff -r 755143994611 -r 881ecce573ae src/mem/ruby/system/Sequencer.cc --- a/src/mem/ruby/system/Sequencer.cc Wed Dec 02 17:27:01 2015 -0600 +++ b/src/mem/ruby/system/Sequencer.cc Wed Dec 02 17:27:02 2015 -0600 @@ -70,6 +70,8 @@ assert(m_dataCache_ptr != NULL); m_usingNetworkTester = p->using_network_tester; + + m_canRestore = p->can_restore; } Sequencer::~Sequencer() diff -r 755143994611 -r 881ecce573ae src/mem/ruby/system/Sequencer.hh --- a/src/mem/ruby/system/Sequencer.hh Wed Dec 02 17:27:01 2015 -0600 +++ b/src/mem/ruby/system/Sequencer.hh Wed Dec 02 17:27:02 2015 -0600 @@ -149,6 +149,9 @@ Stats::Counter getIncompleteTimes(const MachineType t) const { return m_IncompleteTimes[t]; } + bool canRestore() const + { return m_canRestore; } + private: void issueRequest(PacketPtr pkt, RubyRequestType type); @@ -195,6 +198,8 @@ bool m_usingNetworkTester; + bool m_canRestore; + //! Histogram for number of outstanding requests per cycle. Stats::Histogram m_outstandReqHist; diff -r 755143994611 -r 881ecce573ae src/mem/ruby/system/Sequencer.py --- a/src/mem/ruby/system/Sequencer.py Wed Dec 02 17:27:01 2015 -0600 +++ b/src/mem/ruby/system/Sequencer.py Wed Dec 02 17:27:02 2015 -0600 @@ -74,6 +74,7 @@ # are unable to handle IO messages. 
This is a stop-gap fix until # further decisions about GPU, copy engine IO capabilities are made connect_to_io = Param.Bool(True, "Whether to connect to IO") + can_restore = Param.Bool(True, "true if this Sequencer can restore ckpt") class DMASequencer(RubyPort): type = 'DMASequencer' diff -r 755143994611 -r 881ecce573ae src/mem/ruby/system/System.cc --- a/src/mem/ruby/system/System.cc Wed Dec 02 17:27:01 2015 -0600 +++ b/src/mem/ruby/system/System.cc Wed Dec 02 17:27:02 2015 -0600 @@ -37,6 +37,7 @@ #include "debug/RubySystem.hh" #include "mem/ruby/common/Address.hh" #include "mem/ruby/network/Network.hh" +#include "mem/ruby/system/Sequencer.hh" #include "mem/ruby/system/System.hh" #include "sim/eventq.hh" #include "sim/simulate.hh" @@ -105,6 +106,7 @@ m_abs_cntrl_vec.push_back(cntrl); MachineID id = cntrl->getMachineID(); + g_abs_controls[id.getType()][id.getNum()] = cntrl; } @@ -320,8 +322,13 @@ vector<Sequencer*> sequencer_map; Sequencer* t = NULL; for (int cntrl = 0; cntrl < m_abs_cntrl_vec.size(); cntrl++) { - sequencer_map.push_back(m_abs_cntrl_vec[cntrl]->getSequencer()); - if (t == NULL) t = sequencer_map[cntrl]; + Sequencer *s = m_abs_cntrl_vec[cntrl]->getSequencer(); + if (s != NULL && s->canRestore()) { + sequencer_map.push_back(m_abs_cntrl_vec[cntrl]->getSequencer()); + if (t == NULL) t = sequencer_map[cntrl]; + } else { + sequencer_map.push_back(NULL); + } } assert(t != NULL); # HG changeset patch # User Lena Olson # Date 1449098822 21600 # Node ID fbdb503894fb81808370b17788136f437b04e29c # Parent 881ecce573ae277a5ef58ad475a36b928138d6d5 imported patch sequencer_change diff -r 881ecce573ae -r fbdb503894fb src/mem/ruby/system/CacheRecorder.cc --- a/src/mem/ruby/system/CacheRecorder.cc Wed Dec 02 17:27:02 2015 -0600 +++ b/src/mem/ruby/system/CacheRecorder.cc Wed Dec 02 17:27:02 2015 -0600 @@ -130,7 +130,12 @@ Packet *pkt = new Packet(req, requestType); pkt->dataStatic(traceRecord->m_data + rec_bytes_read); - Sequencer* m_sequencer_ptr = m_seq_map[traceRecord->m_cntrl_id]; + int id = traceRecord->m_cntrl_id; + if (id >= m_seq_map.size()) { + id = 0; + } + + Sequencer* m_sequencer_ptr = m_seq_map[id]; assert(m_sequencer_ptr != NULL); m_sequencer_ptr->makeRequest(pkt); } # HG changeset patch # User Lena Olson # Date 1449098822 21600 # Node ID e1974fafe9880d91a2659c00fe16f26ccaf57240 # Parent fbdb503894fb81808370b17788136f437b04e29c imported patch add_vaddr_to_cache diff -r fbdb503894fb -r e1974fafe988 src/mem/protocol/MOESI_hammer-msg.sm --- a/src/mem/protocol/MOESI_hammer-msg.sm Wed Dec 02 17:27:02 2015 -0600 +++ b/src/mem/protocol/MOESI_hammer-msg.sm Wed Dec 02 17:27:02 2015 -0600 @@ -94,6 +94,7 @@ NetDest Destination, desc="Multicast destination mask"; MessageSizeType MessageSize, desc="size category of the message"; bool DirectedProbe, default="false", desc="probe filter directed probe"; + Address VAddr, desc="Virtual address for this request"; Cycles InitialRequestTime, default="Cycles(0)", desc="time the initial requests was sent from the L1Cache"; diff -r fbdb503894fb -r e1974fafe988 src/mem/ruby/slicc_interface/RubyRequest.hh --- a/src/mem/ruby/slicc_interface/RubyRequest.hh Wed Dec 02 17:27:02 2015 -0600 +++ b/src/mem/ruby/slicc_interface/RubyRequest.hh Wed Dec 02 17:27:02 2015 -0600 @@ -42,6 +42,7 @@ { public: Address m_PhysicalAddress; + Address m_VirtualAddress; Address m_LineAddress; RubyRequestType m_Type; Address m_ProgramCounter; @@ -52,12 +53,13 @@ PacketPtr pkt; unsigned m_contextId; - RubyRequest(Tick curTime, uint64_t _paddr, uint8_t* _data, int _len, + RubyRequest(Tick
curTime, uint64_t _paddr, uint64_t _vaddr, uint8_t* _data, int _len, uint64_t _pc, RubyRequestType _type, RubyAccessMode _access_mode, PacketPtr _pkt, PrefetchBit _pb = PrefetchBit_No, unsigned _proc_id = 100) : Message(curTime), m_PhysicalAddress(_paddr), + m_VirtualAddress(_vaddr), m_Type(_type), m_ProgramCounter(_pc), m_AccessMode(_access_mode), @@ -81,6 +83,7 @@ const RubyAccessMode& getAccessMode() const { return m_AccessMode; } const int& getSize() const { return m_Size; } const PrefetchBit& getPrefetch() const { return m_Prefetch; } + const Address& getVirtualAddress() const { return m_VirtualAddress; } void writeData(DataBlock& block) const diff -r fbdb503894fb -r e1974fafe988 src/mem/ruby/system/Sequencer.cc --- a/src/mem/ruby/system/Sequencer.cc Wed Dec 02 17:27:02 2015 -0600 +++ b/src/mem/ruby/system/Sequencer.cc Wed Dec 02 17:27:02 2015 -0600 @@ -702,7 +702,17 @@ pc = pkt->req->getPC(); } + Addr vaddr = 0; + // get a valid virtual address for the gpu caches + if (pkt->req->hasVaddr()){ + vaddr = pkt->req->getVaddr(); + } + //else { + // printf("Packet missing vaddr has pc %lx and paddr %lx\n", pc, pkt->getAddr()); + //} + RubyRequest *msg = new RubyRequest(clockEdge(), pkt->getAddr(), + vaddr, pkt->getPtr<uint8_t>(true), pkt->getSize(), pc, secondary_type, RubyAccessMode_Supervisor, pkt,