From bf61bd127f88eea200ae306fb20caf02c03332aa Mon Sep 17 00:00:00 2001
From: Junshi Wang <junshi.wang@arm.com>
Date: Fri, 16 Aug 2024 21:03:10 +0800
Subject: [PATCH 01/47] arch-arm: Add support of AArch32 VCVTA/P/N/M
 instructions.

Add decoder and function of AArch32 VCVTA, VCVTP, VCVTN and VCVTM
instructions. Support both 16-bit and 32-bit variants.

Only support A32 encoding.

Change-Id: I6ece0e1b779f9a7cc9d709894a49a7fdcda28373
Reviewed-by: Giacomo Travaglini <giacomo.travaglini@arm.com>
---
 src/arch/arm/isa/formats/fp.isa | 146 +++++++++++++++++++++++++++++++-
 src/arch/arm/isa/insts/neon.isa | 124 ++++++++++++++++++++++++++-
 2 files changed, 268 insertions(+), 2 deletions(-)

diff --git a/src/arch/arm/isa/formats/fp.isa b/src/arch/arm/isa/formats/fp.isa
index c8508e16e1..45b0985838 100644
--- a/src/arch/arm/isa/formats/fp.isa
+++ b/src/arch/arm/isa/formats/fp.isa
@@ -1,6 +1,6 @@
 // -*- mode:c++ -*-
 
-// Copyright (c) 2010-2011, 2016-2019 ARM Limited
+// Copyright (c) 2010-2011, 2016-2019, 2024 ARM Limited
 // All rights reserved
 //
 // The license below extends only to copyright in the software and shall
@@ -1891,6 +1891,150 @@ let {{
                         return new NVrsqrteD<uint32_t>(machInst, vd, vm);
                     }
                 }
+            } else if ((b & 0x1c) == 0x00) {
+                if (bits(b, 1)) {
+                    switch(size) {
+                      case 1:
+                        if (q) {
+                            return new NVcvt2uhAQ<uint16_t>(machInst, vd, vm);
+                        } else {
+                            return new NVcvt2uhAD<uint16_t>(machInst, vd, vm);
+                        }
+                      case 2:
+                        if (q) {
+                            return new NVcvt2usAQ<uint32_t>(machInst, vd, vm);
+                        } else {
+                            return new NVcvt2usAD<uint32_t>(machInst, vd, vm);
+                        }
+                      default:
+                        return new Unknown(machInst);
+                    }
+                } else {
+                    switch (size) {
+                      case 0b01:
+                        if (q) {
+                            return new NVcvt2shAQ<int16_t>(machInst, vd, vm);
+                        } else {
+                            return new NVcvt2shAD<int16_t>(machInst, vd, vm);
+                        }
+                      case 0b10:
+                        if (q) {
+                            return new NVcvt2ssAQ<int32_t>(machInst, vd, vm);
+                        } else {
+                            return new NVcvt2ssAD<int32_t>(machInst, vd, vm);
+                        }
+                      default:
+                        return new Unknown(machInst);
+                    }
+                }
+            } else if ((b & 0x1c) == 0x04) {
+                if (bits(b, 1)) {
+                    switch (size) {
+                      case 0b01:
+                        if (q) {
+                            return new NVcvt2uhNQ<uint16_t>(machInst, vd, vm);
+                        } else {
+                            return new NVcvt2uhND<uint16_t>(machInst, vd, vm);
+                        }
+                      case 0b10:
+                        if (q) {
+                            return new NVcvt2usNQ<uint32_t>(machInst, vd, vm);
+                        } else {
+                            return new NVcvt2usND<uint32_t>(machInst, vd, vm);
+                        }
+                      default:
+                        return new Unknown(machInst);
+                    }
+                } else {
+                    switch (size) {
+                      case 0b01:
+                        if (q) {
+                            return new NVcvt2shNQ<int16_t>(machInst, vd, vm);
+                        } else {
+                            return new NVcvt2shND<int16_t>(machInst, vd, vm);
+                        }
+                      case 0b10:
+                        if (q) {
+                            return new NVcvt2ssNQ<int32_t>(machInst, vd, vm);
+                        } else {
+                            return new NVcvt2ssND<int32_t>(machInst, vd, vm);
+                        }
+                      default:
+                        return new Unknown(machInst);
+                    }
+                }
+            } else if ((b & 0x1c) == 0x08) {
+                if (bits(b, 1)) {
+                    switch (size) {
+                      case 0b01:
+                        if (q) {
+                            return new NVcvt2uhPQ<uint16_t>(machInst, vd, vm);
+                        } else {
+                            return new NVcvt2uhPD<uint16_t>(machInst, vd, vm);
+                        }
+                      case 0b10:
+                        if (q) {
+                            return new NVcvt2usPQ<uint32_t>(machInst, vd, vm);
+                        } else {
+                            return new NVcvt2usPD<uint32_t>(machInst, vd, vm);
+                        }
+                      default:
+                        return new Unknown(machInst);
+                    }
+                } else {
+                    switch (size) {
+                      case 0b01:
+                        if (q) {
+                            return new NVcvt2shPQ<int16_t>(machInst, vd, vm);
+                        } else {
+                            return new NVcvt2shPD<int16_t>(machInst, vd, vm);
+                        }
+                      case 0b10:
+                        if (q) {
+                            return new NVcvt2ssPQ<int32_t>(machInst, vd, vm);
+                        } else {
+                            return new NVcvt2ssPD<int32_t>(machInst, vd, vm);
+                        }
+                      default:
+                        return new Unknown(machInst);
+                    }
+                }
+            } else if ((b & 0x1c) == 0x0c) {
+                if (bits(b, 1)) {
+                    switch (size) {
+                      case 0b01:
+                        if (q) {
+                            return new NVcvt2uhMQ<uint16_t>(machInst, vd, vm);
+                        } else {
+                            return new NVcvt2uhMD<uint16_t>(machInst, vd, vm);
+                        }
+                      case 0b10:
+                        if (q) {
+                            return new NVcvt2usMQ<uint32_t>(machInst, vd, vm);
+                        } else {
+                            return new NVcvt2usMD<uint32_t>(machInst, vd, vm);
+                        }
+                      default:
+                        return new Unknown(machInst);
+                    }
+                } else {
+                    switch (size) {
+                      case 0b01:
+                        if (q) {
+                            return new NVcvt2shMQ<int16_t>(machInst, vd, vm);
+                        } else {
+                            return new NVcvt2shMD<int16_t>(machInst, vd, vm);
+                        }
+                      case 0b10:
+                        if (q) {
+                            return new NVcvt2ssMQ<int32_t>(machInst, vd, vm);
+                        } else {
+                            return new NVcvt2ssMD<int32_t>(machInst, vd, vm);
+                        }
+                      default:
+                        return new Unknown(machInst);
+                    }
+                }
             } else {
                 return new Unknown(machInst);
             }
diff --git a/src/arch/arm/isa/insts/neon.isa b/src/arch/arm/isa/insts/neon.isa
index 5f39e48cce..04d6929ae0 100644
--- a/src/arch/arm/isa/insts/neon.isa
+++ b/src/arch/arm/isa/insts/neon.isa
@@ -1,6 +1,6 @@
 // -*- mode:c++ -*-
 
-// Copyright (c) 2010-2011, 2015, 2019 ARM Limited
+// Copyright (c) 2010-2011, 2015, 2019, 2024 ARM Limited
 // All rights reserved
 //
 // The license below extends only to copyright in the software and shall
@@ -3579,6 +3579,128 @@ let {{
     '''
     twoRegLongMiscInst("vcvt", "NVcvth2s", "SimdCvtOp", ("uint16_t",), vcvth2sCode)
 
+    vcvthp2hCode = '''
+        FPSCR fpscr = (FPSCR) FpscrExc;
+        VfpSavedState state = prepFpState(fpscr.rMode);
+        __asm__ __volatile__("" : "=m" (srcElem1) : "m" (srcElem1));
+        float mid = vcvtFpHFpS(fpscr, fpscr.dn, fpscr.ahp, srcElem1);
+        if (flushToZero(mid))
+            fpscr.idc = 1;
+        destElem = vfpFpToFixed<float>(mid, %s, 16, 0, true, %s);
+        __asm__ __volatile__("" :: "m" (destElem));
+        finishVfp(fpscr, state, true);
+        FpscrExc = fpscr;
+    '''
+
+    vcvtahp2uhCode = vcvthp2hCode % ("false", "VfpRoundAway")
+    twoRegMiscInst("vcvta.u16.f16", "NVcvt2uhAD", "SimdCvtOp",
+                   ("uint16_t",), 2, vcvtahp2uhCode)
+    twoRegMiscInst("vcvta.u16.f16", "NVcvt2uhAQ", "SimdCvtOp",
+                   ("uint16_t",), 4, vcvtahp2uhCode)
+
+    vcvtnhp2uhCode = vcvthp2hCode % ("false", "VfpRoundNearest")
+    twoRegMiscInst("vcvtn.u16.f16", "NVcvt2uhND", "SimdCvtOp",
+                   ("uint16_t",), 2, vcvtnhp2uhCode)
+    twoRegMiscInst("vcvtn.u16.f16", "NVcvt2uhNQ", "SimdCvtOp",
+                   ("uint16_t",), 4, vcvtnhp2uhCode)
+
+    vcvtphp2uhCode = vcvthp2hCode % ("false", "VfpRoundUpward")
+    twoRegMiscInst("vcvtp.u16.f16", "NVcvt2uhPD", "SimdCvtOp",
+                   ("uint16_t",), 2, vcvtphp2uhCode)
+    twoRegMiscInst("vcvtp.u16.f16", "NVcvt2uhPQ", "SimdCvtOp",
+                   ("uint16_t",), 4, vcvtphp2uhCode)
+
+    vcvtmhp2uhCode = vcvthp2hCode % ("false", "VfpRoundDown")
+    twoRegMiscInst("vcvtm.u16.f16", "NVcvt2uhMD", "SimdCvtOp",
+                   ("uint16_t",), 2, vcvtmhp2uhCode)
+    twoRegMiscInst("vcvtm.u16.f16", "NVcvt2uhMQ", "SimdCvtOp",
+                   ("uint16_t",), 4, vcvtmhp2uhCode)
+
+    vcvtahp2shCode = vcvthp2hCode % ("true", "VfpRoundAway")
+    twoRegMiscInst("vcvta.s16.f16", "NVcvt2shAD", "SimdCvtOp",
+                   ("int16_t",), 2, vcvtahp2shCode)
+    twoRegMiscInst("vcvta.s16.f16", "NVcvt2shAQ", "SimdCvtOp",
+                   ("int16_t",), 4, vcvtahp2shCode)
+
+    vcvtnhp2shCode = vcvthp2hCode % ("true", "VfpRoundNearest")
+    twoRegMiscInst("vcvtn.s16.f16", "NVcvt2shND", "SimdCvtOp",
+                   ("int16_t",), 2, vcvtnhp2shCode)
+    twoRegMiscInst("vcvtn.s16.f16", "NVcvt2shNQ", "SimdCvtOp",
+                   ("int16_t",), 4, vcvtnhp2shCode)
+
+    vcvtphp2shCode = vcvthp2hCode % ("true", "VfpRoundUpward")
+    twoRegMiscInst("vcvtp.s16.f16", "NVcvt2shPD", "SimdCvtOp",
+                   ("int16_t",), 2, vcvtphp2shCode)
+    twoRegMiscInst("vcvtp.s16.f16", "NVcvt2shPQ", "SimdCvtOp",
+                   ("int16_t",), 4, vcvtphp2shCode)
+
+    vcvtmhp2shCode = vcvthp2hCode % ("true", "VfpRoundDown")
+    twoRegMiscInst("vcvtm.s16.f16", "NVcvt2shMD", "SimdCvtOp",
+                   ("int16_t",), 2, vcvtmhp2shCode)
+    twoRegMiscInst("vcvtm.s16.f16", "NVcvt2shMQ", "SimdCvtOp",
+                   ("int16_t",), 4, vcvtmhp2shCode)
+
+    vcvtsp2sCode = '''
+        FPSCR fpscr = (FPSCR) FpscrExc;
+        VfpSavedState state = prepFpState(fpscr.rMode);
+        __asm__ __volatile__("" : "=m" (srcElem1) : "m" (srcElem1));
+        float mid = bitsToFp(srcElem1, (float)0.0);
+        if (flushToZero(mid))
+            fpscr.idc = 1;
+        destElem = vfpFpToFixed<float>(mid, %s, 32, 0, true, %s);
+        __asm__ __volatile__("" :: "m" (destElem));
+        finishVfp(fpscr, state, true);
+        FpscrExc = fpscr;
+    '''
+
+    vcvtasp2usCode = vcvtsp2sCode % ("false", "VfpRoundAway")
+    twoRegMiscInst("vcvta.u32.f32", "NVcvt2usAD", "SimdCvtOp",
+                   ("uint32_t",), 2, vcvtasp2usCode)
+    twoRegMiscInst("vcvta.u32.f32", "NVcvt2usAQ", "SimdCvtOp",
+                   ("uint32_t",), 4, vcvtasp2usCode)
+
+    vcvtnsp2usCode = vcvtsp2sCode % ("false", "VfpRoundNearest")
+    twoRegMiscInst("vcvtn.u32.f32", "NVcvt2usND", "SimdCvtOp",
+                   ("uint32_t",), 2, vcvtnsp2usCode)
+    twoRegMiscInst("vcvtn.u32.f32", "NVcvt2usNQ", "SimdCvtOp",
+                   ("uint32_t",), 4, vcvtnsp2usCode)
+
+    vcvtpsp2usCode = vcvtsp2sCode % ("false", "VfpRoundUpward")
+    twoRegMiscInst("vcvtp.u32.f32", "NVcvt2usPD", "SimdCvtOp",
+                   ("uint32_t",), 2, vcvtpsp2usCode)
+    twoRegMiscInst("vcvtp.u32.f32", "NVcvt2usPQ", "SimdCvtOp",
+                   ("uint32_t",), 4, vcvtpsp2usCode)
+
+    vcvtmsp2usCode = vcvtsp2sCode % ("false", "VfpRoundDown")
+    twoRegMiscInst("vcvtm.u32.f32", "NVcvt2usMD", "SimdCvtOp",
+                   ("uint32_t",), 2, vcvtmsp2usCode)
+    twoRegMiscInst("vcvtm.u32.f32", "NVcvt2usMQ", "SimdCvtOp",
+                   ("uint32_t",), 4, vcvtmsp2usCode)
+
+    vcvtasp2ssCode = vcvtsp2sCode % ("true", "VfpRoundAway")
+    twoRegMiscInst("vcvta.s32.f32", "NVcvt2ssAD", "SimdCvtOp",
+                   ("int32_t",), 2, vcvtasp2ssCode)
+    twoRegMiscInst("vcvta.s32.f32", "NVcvt2ssAQ", "SimdCvtOp",
+                   ("int32_t",), 4, vcvtasp2ssCode)
+
+    vcvtnsp2ssCode = vcvtsp2sCode % ("true", "VfpRoundNearest")
+    twoRegMiscInst("vcvtn.s32.f32", "NVcvt2ssND", "SimdCvtOp",
+                   ("int32_t",), 2, vcvtnsp2ssCode)
+    twoRegMiscInst("vcvtn.s32.f32", "NVcvt2ssNQ", "SimdCvtOp",
+                   ("int32_t",), 4, vcvtnsp2ssCode)
+
+    vcvtpsp2ssCode = vcvtsp2sCode % ("true", "VfpRoundUpward")
+    twoRegMiscInst("vcvtp.s32.f32", "NVcvt2ssPD", "SimdCvtOp",
+                   ("int32_t",), 2, vcvtpsp2ssCode)
+    twoRegMiscInst("vcvtp.s32.f32", "NVcvt2ssPQ", "SimdCvtOp",
+                   ("int32_t",), 4, vcvtpsp2ssCode)
+
+    vcvtmsp2ssCode = vcvtsp2sCode % ("true", "VfpRoundDown")
+    twoRegMiscInst("vcvtm.s32.f32", "NVcvt2ssMD", "SimdCvtOp",
+                   ("int32_t",), 2, vcvtmsp2ssCode)
+    twoRegMiscInst("vcvtm.s32.f32", "NVcvt2ssMQ", "SimdCvtOp",
+                   ("int32_t",), 4, vcvtmsp2ssCode)
+
     vrsqrteCode = '''
         destElem = unsignedRSqrtEstimate(srcElem1);
     '''

From be49bf89c028c283b02d0cc2ce5653a0cbd98856 Mon Sep 17 00:00:00 2001
From: "Tommaso Marinelli (imec)" <Tommaso.Marinelli@imec.be>
Date: Wed, 2 Oct 2024 16:16:19 +0200
Subject: [PATCH 02/47] arch-riscv: Enable clone3 syscall in riscv64

The clone3 syscall, implemented in commit 87e774c, is currently only
handled for x86-64 in gem5. Clone3 is employed by modern glibc versions
instead of clone for processes/threads generation (e.g. issue #1204).
This commit enables the clone3 syscall in riscv64 by adding the
corresponding handler call, as well as its arguments struct.
---
 src/arch/riscv/linux/linux.hh       | 15 +++++++++++++++
 src/arch/riscv/linux/se_workload.cc |  1 +
 2 files changed, 16 insertions(+)

diff --git a/src/arch/riscv/linux/linux.hh b/src/arch/riscv/linux/linux.hh
index 997eb6af4c..b2fbdd29f3 100644
--- a/src/arch/riscv/linux/linux.hh
+++ b/src/arch/riscv/linux/linux.hh
@@ -195,6 +195,21 @@ class RiscvLinux64 : public RiscvLinux, public OpenFlagTable<RiscvLinux64>
         uint32_t mem_unit;
     };
 
+    struct tgt_clone_args
+    {
+        uint64_t flags;
+        uint64_t pidfd;
+        uint64_t child_tid;
+        uint64_t parent_tid;
+        uint64_t exit_signal;
+        uint64_t stack;
+        uint64_t stack_size;
+        uint64_t tls;
+        uint64_t set_tid;
+        uint64_t set_tid_size;
+        uint64_t cgroup;
+    };
+
     static void
     archClone(uint64_t flags,
               Process *pp, Process *cp,
diff --git a/src/arch/riscv/linux/se_workload.cc b/src/arch/riscv/linux/se_workload.cc
index c1af16fb3b..77d262ede6 100644
--- a/src/arch/riscv/linux/se_workload.cc
+++ b/src/arch/riscv/linux/se_workload.cc
@@ -410,6 +410,7 @@ SyscallDescTable<SEWorkload::SyscallABI64> EmuLinux::syscallDescs64 = {
     { 285,  "copy_file_range" },
     { 286,  "preadv2" },
     { 287,  "pwritev2" },
+    { 435,  "clone3", clone3Func<RiscvLinux64> },
     { 1024, "open", openFunc<RiscvLinux64> },
     { 1025, "link", linkFunc },
     { 1026, "unlink", unlinkFunc },

From 242c0e96930355f53ad965ce043336d947c1339f Mon Sep 17 00:00:00 2001
From: Tommaso Marinelli <tommarin@ucm.es>
Date: Thu, 3 Oct 2024 03:25:39 +0200
Subject: [PATCH 03/47] arch-riscv: Add more syscall placeholders

---
 src/arch/riscv/linux/se_workload.cc | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/src/arch/riscv/linux/se_workload.cc b/src/arch/riscv/linux/se_workload.cc
index 77d262ede6..6caec283ed 100644
--- a/src/arch/riscv/linux/se_workload.cc
+++ b/src/arch/riscv/linux/se_workload.cc
@@ -410,7 +410,33 @@ SyscallDescTable<SEWorkload::SyscallABI64> EmuLinux::syscallDescs64 = {
     { 285,  "copy_file_range" },
     { 286,  "preadv2" },
     { 287,  "pwritev2" },
+    { 424,  "pidfd_send_signal" },
+    { 425,  "io_uring_setup" },
+    { 426,  "io_uring_enter" },
+    { 427,  "io_uring_register" },
+    { 428,  "open_tree" },
+    { 429,  "move_mount" },
+    { 430,  "fsopen" },
+    { 431,  "fsconfig" },
+    { 432,  "fsmount" },
+    { 433,  "fspick" },
+    { 434,  "pidfd_open" },
     { 435,  "clone3", clone3Func<RiscvLinux64> },
+    { 436,  "close_range" },
+    { 437,  "openat2" },
+    { 438,  "pidfd_getfd" },
+    { 439,  "faccessat2" },
+    { 440,  "process_madvise" },
+    { 441,  "epoll_pwait2" },
+    { 442,  "mount_setattr" },
+    { 443,  "quotactl_fd" },
+    { 444,  "landlock_create_ruleset" },
+    { 445,  "landlock_add_rule" },
+    { 446,  "landlock_restrict_self" },
+    { 447,  "memfd_secret" },
+    { 448,  "process_mrelease" },
+    { 449,  "futex_waitv" },
+    { 450,  "set_mempolicy_home_node" },
     { 1024, "open", openFunc<RiscvLinux64> },
     { 1025, "link", linkFunc },
     { 1026, "unlink", unlinkFunc },

From 4b3ba1daa628b14213c3b42c853f62268db58ebd Mon Sep 17 00:00:00 2001
From: "Bobby R. Bruce" <bbruce@ucdavis.edu>
Date: Thu, 22 Aug 2024 05:08:19 -0700
Subject: [PATCH 04/47] stdlib: Deprecate Simulator 'full_system' param

This is deprecated in favor of the board determining whether the
simulation is FS or SE. Usually this will be contingent on which
`set_workload` function has been called. Regardless, it is the board's
responsibility. The user should not need to explicitly declare this any
longer.
---
 src/python/gem5/simulate/simulator.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/src/python/gem5/simulate/simulator.py b/src/python/gem5/simulate/simulator.py
index 49dfac2bdf..59385fe17d 100644
--- a/src/python/gem5/simulate/simulator.py
+++ b/src/python/gem5/simulate/simulator.py
@@ -117,6 +117,10 @@ class Simulator:
                             behavior. If not set, whether or not to run in FS
                             mode will be determined via the board's
                             ``is_fullsystem()`` function.
+                            **Warning: This parameter is deprecated. The board
+                            determines if the simulation is full system or not.
+                            This parameter will be removed in a future gem5
+                            release.**
         :param on_exit_event: An optional map to specify what to execute on
                               each exit event. There are three possibilities here:
                               a generator, a list of functions, or a single function.
@@ -291,6 +295,15 @@ class Simulator:
 
         """
 
+        if full_system is not None:
+            warn(
+                "Setting the full_system parameter via the Simulator "
+                "constructor is deprecated and will be removed in future "
+                "releases of gem5. "
+                "The board determines if the simulation is full system or not "
+                "via it's `is_fullsystem` method."
+            )
+
         self.set_max_ticks(max_ticks)
 
         if id:

From 4bdcb040d0cc94c8f6e609d5c584228a749a9f15 Mon Sep 17 00:00:00 2001
From: "Bobby R. Bruce" <bbruce@ucdavis.edu>
Date: Thu, 22 Aug 2024 05:16:47 -0700
Subject: [PATCH 05/47] stdlib: Move Root obj creation from Simulator to Board

It makes much more sense for the Root Object to be created within the
board and passed where required. Creating it in the Simulator class is
not required.

For this to work the signature of the `_pre_instantiate` function in
`AbstractBoard` has been updated to return the Root object.
---
 configs/example/lupv/run_lupv.py              |  3 +--
 .../gem5/components/boards/abstract_board.py  | 25 +++++++++++++++++--
 .../gem5/components/boards/arm_board.py       |  5 ++--
 .../gem5/components/boards/riscv_board.py     |  9 ++++---
 .../riscvmatched/riscvmatched_board.py        |  4 +--
 src/python/gem5/simulate/simulator.py         | 18 +++----------
 .../configs/boot_kvm_fork_run.py              |  8 +++---
 .../configs/run_replacement_policy.py         |  3 +--
 .../run_replacement_policy.py                 |  3 +--
 .../traffic_gen/configs/simple_traffic_run.py |  4 +--
 10 files changed, 46 insertions(+), 36 deletions(-)

diff --git a/configs/example/lupv/run_lupv.py b/configs/example/lupv/run_lupv.py
index f6f938b16c..57bf6ca6b8 100644
--- a/configs/example/lupv/run_lupv.py
+++ b/configs/example/lupv/run_lupv.py
@@ -110,8 +110,7 @@ board.set_kernel_disk_workload(
 # Begin running of the simulation.
 print("Running with ISA: " + processor.get_isa().name)
 print()
-root = Root(full_system=True, system=board)
-board._pre_instantiate()
+root = board._pre_instantiate()
 m5.instantiate()
 print("Beginning simulation!")
 
diff --git a/src/python/gem5/components/boards/abstract_board.py b/src/python/gem5/components/boards/abstract_board.py
index 83ca32d9c0..34308a1e11 100644
--- a/src/python/gem5/components/boards/abstract_board.py
+++ b/src/python/gem5/components/boards/abstract_board.py
@@ -41,6 +41,7 @@ from m5.objects import (
     ClockDomain,
     IOXBar,
     Port,
+    Root,
     SrcClockDomain,
     System,
     VoltageDomain,
@@ -391,13 +392,33 @@ class AbstractBoard:
             self.get_cache_hierarchy()._post_instantiate()
         self.get_memory()._post_instantiate()
 
-    def _pre_instantiate(self):
+    def _pre_instantiate(self, full_system: Optional[bool] = None) -> Root:
         """To be called immediately before ``m5.instantiate``. This is where
-        ``_connect_things`` is executed by default."""
+        ``_connect_things`` is executed by default and the root object is Root
+        object is created and returned.
+
+        :param full_system: Used to pass the full system flag to the board from
+                            the Simulator module. **Note**: This was
+                            implemented solely to maintain backawards
+                            compatibility with while the Simululator module's
+                            `full_system` flag is in state of deprecation. This
+                            parameter will be removed when it is. When this
+                            occurs whether a simulation is to be run in FS or
+                            SE mode will be determined by the board set."""
 
         # Connect the memory, processor, and cache hierarchy.
         self._connect_things()
 
+        # Return the Root object.
+        return Root(
+            full_system=(
+                full_system
+                if full_system is not None
+                else self.is_fullsystem()
+            ),
+            board=self,
+        )
+
     def _connect_things_check(self):
         """
         Here we check that connect things has been called and throw an
diff --git a/src/python/gem5/components/boards/arm_board.py b/src/python/gem5/components/boards/arm_board.py
index 0a0cd2fa28..e8907a99c9 100644
--- a/src/python/gem5/components/boards/arm_board.py
+++ b/src/python/gem5/components/boards/arm_board.py
@@ -28,6 +28,7 @@ import os
 from abc import ABCMeta
 from typing import (
     List,
+    Optional,
     Sequence,
     Tuple,
 )
@@ -327,8 +328,8 @@ class ArmBoard(ArmSystem, AbstractBoard, KernelDiskWorkload):
         self.system_port = port
 
     @overrides(AbstractBoard)
-    def _pre_instantiate(self):
-        super()._pre_instantiate()
+    def _pre_instantiate(self, full_system: Optional[bool] = None) -> None:
+        super()._pre_instantiate(full_system=full_system)
 
         # Add the PCI devices.
         self.pci_devices = self._pci_devices
diff --git a/src/python/gem5/components/boards/riscv_board.py b/src/python/gem5/components/boards/riscv_board.py
index e8e27029f2..e14833c996 100644
--- a/src/python/gem5/components/boards/riscv_board.py
+++ b/src/python/gem5/components/boards/riscv_board.py
@@ -26,7 +26,10 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import os
-from typing import List
+from typing import (
+    List,
+    Optional,
+)
 
 import m5
 from m5.objects import (
@@ -498,7 +501,7 @@ class RiscvBoard(AbstractSystemBoard, KernelDiskWorkload):
         return "/dev/vda"
 
     @overrides(AbstractSystemBoard)
-    def _pre_instantiate(self):
+    def _pre_instantiate(self, full_system: Optional[bool] = None):
         if len(self._bootloader) > 0:
             self.workload.bootloader_addr = 0x0
             self.workload.bootloader_filename = self._bootloader[0]
@@ -507,7 +510,7 @@ class RiscvBoard(AbstractSystemBoard, KernelDiskWorkload):
         else:
             self.workload.kernel_addr = 0x0
             self.workload.entry_point = 0x80000000
-        self._connect_things()
+        super()._pre_instantiate(full_system=full_system)
 
     @overrides(KernelDiskWorkload)
     def _add_disk_to_board(self, disk_image: AbstractResource):
diff --git a/src/python/gem5/prebuilt/riscvmatched/riscvmatched_board.py b/src/python/gem5/prebuilt/riscvmatched/riscvmatched_board.py
index 23a7dcc8cb..ba9588c725 100644
--- a/src/python/gem5/prebuilt/riscvmatched/riscvmatched_board.py
+++ b/src/python/gem5/prebuilt/riscvmatched/riscvmatched_board.py
@@ -313,7 +313,7 @@ class RISCVMatchedBoard(
             memory.set_memory_range(self.mem_ranges)
 
     @overrides(AbstractSystemBoard)
-    def _pre_instantiate(self):
+    def _pre_instantiate(self, full_system: Optional[bool] = None) -> None:
         if self._fs:
             if len(self._bootloader) > 0:
                 self.workload.bootloader_addr = 0x0
@@ -326,7 +326,7 @@ class RISCVMatchedBoard(
                 self.workload.kernel_addr = 0x0
                 self.workload.entry_point = 0x80000000
 
-        self._connect_things()
+        super()._pre_instantiate(full_system=full_system)
 
     def generate_device_tree(self, outdir: str) -> None:
         """Creates the ``dtb`` and ``dts`` files.
diff --git a/src/python/gem5/simulate/simulator.py b/src/python/gem5/simulate/simulator.py
index 59385fe17d..a1b75a0523 100644
--- a/src/python/gem5/simulate/simulator.py
+++ b/src/python/gem5/simulate/simulator.py
@@ -664,22 +664,12 @@ class Simulator:
 
         if not self._instantiated:
             # Before anything else we run the AbstractBoard's
-            # `_pre_instantiate` function.
-            self._board._pre_instantiate()
-
-            root = Root(
-                full_system=(
-                    self._full_system
-                    if self._full_system is not None
-                    else self._board.is_fullsystem()
-                ),
-                board=self._board,
+            # `_pre_instantiate` function. This returns the root object which
+            # is required for instantiation.
+            self._root = self._board._pre_instantiate(
+                full_system=self._full_system
             )
 
-            # We take a copy of the Root in case it's required elsewhere
-            # (for example, in `get_stats()`).
-            self._root = root
-
             # The following is a bit of a hack. If a simulation is to use a KVM
             # core then the `sim_quantum` value must be set. However, in the
             # case of using a SwitchableProcessor the KVM cores may be
diff --git a/tests/gem5/kvm_fork_tests/configs/boot_kvm_fork_run.py b/tests/gem5/kvm_fork_tests/configs/boot_kvm_fork_run.py
index 8850a27c75..be6e6009e1 100644
--- a/tests/gem5/kvm_fork_tests/configs/boot_kvm_fork_run.py
+++ b/tests/gem5/kvm_fork_tests/configs/boot_kvm_fork_run.py
@@ -207,15 +207,15 @@ print("Running with ISA: " + processor.get_isa().name)
 print("Running with protocol: " + get_runtime_coherence_protocol().name)
 print()
 
-root = Root(full_system=True, system=motherboard)
+# Disable the gdb ports. Required for forking.
+m5.disableAllListeners()
+root = motherboard._pre_instantiate()
 
 # TODO: This of annoying. Is there a way to fix this to happen
 # automatically when running KVM?
 root.sim_quantum = int(1e9)
 
-# Disable the gdb ports. Required for forking.
-m5.disableAllListeners()
-motherboard._pre_instantiate()
+
 m5.instantiate()
 
 # Simulate the inital boot with the starting KVM cpu
diff --git a/tests/gem5/replacement_policies/configs/run_replacement_policy.py b/tests/gem5/replacement_policies/configs/run_replacement_policy.py
index 8f52a061f6..f7ecdb71de 100644
--- a/tests/gem5/replacement_policies/configs/run_replacement_policy.py
+++ b/tests/gem5/replacement_policies/configs/run_replacement_policy.py
@@ -83,9 +83,8 @@ motherboard = TestBoard(
     memory=memory,
     cache_hierarchy=cache_hierarchy,
 )
-root = Root(full_system=False, system=motherboard)
 
-motherboard._pre_instantiate()
+root = motherboard._pre_instantiate()
 m5.instantiate()
 
 generator.start_traffic()
diff --git a/tests/gem5/replacement_policies/run_replacement_policy.py b/tests/gem5/replacement_policies/run_replacement_policy.py
index 8f52a061f6..f7ecdb71de 100644
--- a/tests/gem5/replacement_policies/run_replacement_policy.py
+++ b/tests/gem5/replacement_policies/run_replacement_policy.py
@@ -83,9 +83,8 @@ motherboard = TestBoard(
     memory=memory,
     cache_hierarchy=cache_hierarchy,
 )
-root = Root(full_system=False, system=motherboard)
 
-motherboard._pre_instantiate()
+root = motherboard._pre_instantiate()
 m5.instantiate()
 
 generator.start_traffic()
diff --git a/tests/gem5/traffic_gen/configs/simple_traffic_run.py b/tests/gem5/traffic_gen/configs/simple_traffic_run.py
index 3a850b497d..7c264cefe9 100644
--- a/tests/gem5/traffic_gen/configs/simple_traffic_run.py
+++ b/tests/gem5/traffic_gen/configs/simple_traffic_run.py
@@ -202,9 +202,7 @@ motherboard = TestBoard(
     cache_hierarchy=cache_hierarchy,
 )
 
-root = Root(full_system=False, system=motherboard)
-
-motherboard._pre_instantiate()
+root = motherboard._pre_instantiate()
 m5.instantiate()
 
 generator.start_traffic()

From b358471eb954660b6fba9be327ee91d62f366761 Mon Sep 17 00:00:00 2001
From: "Bobby R. Bruce" <bbruce@ucdavis.edu>
Date: Thu, 22 Aug 2024 05:20:58 -0700
Subject: [PATCH 06/47] stdlib: Move 'sim_quantum' set from Simulator to
 Processor

The setting of the `sim_quantum` parameter makes considerably more sense
to occur in the Processor. Through the `_pre_instantiate` functions this
is now possible.
---
 .../gem5/components/boards/abstract_board.py  | 13 ++++++++---
 .../processors/abstract_processor.py          | 14 ++++++++++-
 .../processors/base_cpu_processor.py          |  8 +++++++
 .../processors/switchable_processor.py        | 22 ++++++++++++++++++
 src/python/gem5/simulate/simulator.py         | 23 -------------------
 5 files changed, 53 insertions(+), 27 deletions(-)

diff --git a/src/python/gem5/components/boards/abstract_board.py b/src/python/gem5/components/boards/abstract_board.py
index 34308a1e11..af9ff300b6 100644
--- a/src/python/gem5/components/boards/abstract_board.py
+++ b/src/python/gem5/components/boards/abstract_board.py
@@ -406,11 +406,11 @@ class AbstractBoard:
                             occurs whether a simulation is to be run in FS or
                             SE mode will be determined by the board set."""
 
-        # Connect the memory, processor, and cache hierarchy.
+        # 1. Connect the memory, processor, and cache hierarchy.
         self._connect_things()
 
-        # Return the Root object.
-        return Root(
+        # 2. Create the root object
+        root = Root(
             full_system=(
                 full_system
                 if full_system is not None
@@ -419,6 +419,13 @@ class AbstractBoard:
             board=self,
         )
 
+        # 3. Call any of the components' `_pre_instantiate` functions.
+        # Right now, only the processor requires this.
+        self.get_processor()._pre_instantiate(root)
+
+        # 4. Return the root object.
+        return root
+
     def _connect_things_check(self):
         """
         Here we check that connect things has been called and throw an
diff --git a/src/python/gem5/components/processors/abstract_processor.py b/src/python/gem5/components/processors/abstract_processor.py
index 79dba438a2..303b9658f2 100644
--- a/src/python/gem5/components/processors/abstract_processor.py
+++ b/src/python/gem5/components/processors/abstract_processor.py
@@ -33,7 +33,10 @@ from typing import (
     Optional,
 )
 
-from m5.objects import SubSystem
+from m5.objects import (
+    Root,
+    SubSystem,
+)
 
 from ...isas import ISA
 from ...utils.requires import requires
@@ -83,3 +86,12 @@ class AbstractProcessor(SubSystem):
     def _post_instantiate(self) -> None:
         """Called to set up anything needed after ``m5.instantiate``."""
         pass
+
+    def _pre_instantiate(self, root: Root) -> None:
+        """Called in the `AbstractBoard`'s `_pre_instantiate` method. This is
+        called after `connect_things`, after the creation of the root object
+        (which is passed in as an argument), but before `m5.instantiate`).
+
+        Subclasses should override this method to set up any connections.
+        """
+        pass
diff --git a/src/python/gem5/components/processors/base_cpu_processor.py b/src/python/gem5/components/processors/base_cpu_processor.py
index b1a63ea8ce..674148b409 100644
--- a/src/python/gem5/components/processors/base_cpu_processor.py
+++ b/src/python/gem5/components/processors/base_cpu_processor.py
@@ -27,12 +27,14 @@
 
 from typing import List
 
+import m5
 from m5.objects import (
     BaseAtomicSimpleCPU,
     BaseMinorCPU,
     BaseNonCachingSimpleCPU,
     BaseO3CPU,
     BaseTimingSimpleCPU,
+    Root,
 )
 from m5.util import warn
 
@@ -99,3 +101,9 @@ class BaseCPUProcessor(AbstractProcessor):
                 board.set_mem_mode(MemMode.ATOMIC)
         else:
             raise NotImplementedError
+
+    def _pre_instantiate(self, root: Root) -> None:
+        super()._pre_instantiate(root)
+        if any(core.is_kvm_core() for core in self.get_cores()):
+            m5.ticks.fixGlobalFrequency()
+            root.sim_quantum = m5.ticks.fromSeconds(0.001)
diff --git a/src/python/gem5/components/processors/switchable_processor.py b/src/python/gem5/components/processors/switchable_processor.py
index 2436c9e81f..a5a9ae2b6b 100644
--- a/src/python/gem5/components/processors/switchable_processor.py
+++ b/src/python/gem5/components/processors/switchable_processor.py
@@ -31,6 +31,7 @@ from typing import (
 )
 
 import m5
+from m5.objects import Root
 
 from ...utils.override import *
 from ..boards.abstract_board import AbstractBoard
@@ -155,3 +156,24 @@ class SwitchableProcessor(AbstractProcessor):
 
         # Ensure the current processor is updated.
         self._current_cores = to_switch
+
+    def _pre_instantiate(self, root: Root) -> None:
+        super()._pre_instantiate(root)
+        # The following is a bit of a hack. If a simulation is to use a KVM
+        # core then the `sim_quantum` value must be set. However, in the
+        # case of using a SwitchableProcessor the KVM cores may be
+        # switched out and therefore not accessible via `get_cores()`.
+        # This is the reason for the `isinstance` check.
+        #
+        # We cannot set the `sim_quantum` value in every simulation as
+        # setting it causes the scheduling of exits to be off by the
+        # `sim_quantum` value (something necessary if we are using KVM
+        # cores). Ergo we only set the value of KVM cores are present.
+        #
+        # There is still a bug here in that if the user is switching to and
+        # from KVM and non-KVM cores via the SwitchableProcessor then the
+        # scheduling of exits for the non-KVM cores will be incorrect. This
+        # will be fixed at a later date.
+        if self._prepare_kvm:
+            m5.ticks.fixGlobalFrequency()
+            root.sim_quantum = m5.ticks.fromSeconds(0.001)
diff --git a/src/python/gem5/simulate/simulator.py b/src/python/gem5/simulate/simulator.py
index a1b75a0523..ba74361915 100644
--- a/src/python/gem5/simulate/simulator.py
+++ b/src/python/gem5/simulate/simulator.py
@@ -670,29 +670,6 @@ class Simulator:
                 full_system=self._full_system
             )
 
-            # The following is a bit of a hack. If a simulation is to use a KVM
-            # core then the `sim_quantum` value must be set. However, in the
-            # case of using a SwitchableProcessor the KVM cores may be
-            # switched out and therefore not accessible via `get_cores()`.
-            # This is the reason for the `isinstance` check.
-            #
-            # We cannot set the `sim_quantum` value in every simulation as
-            # setting it causes the scheduling of exits to be off by the
-            # `sim_quantum` value (something necessary if we are using KVM
-            # cores). Ergo we only set the value of KVM cores are present.
-            #
-            # There is still a bug here in that if the user is switching to and
-            # from KVM and non-KVM cores via the SwitchableProcessor then the
-            # scheduling of exits for the non-KVM cores will be incorrect. This
-            # will be fixed at a later date.
-            processor = self._board.processor
-            if any(core.is_kvm_core() for core in processor.get_cores()) or (
-                isinstance(processor, SwitchableProcessor)
-                and any(core.is_kvm_core() for core in processor._all_cores())
-            ):
-                m5.ticks.fixGlobalFrequency()
-                root.sim_quantum = m5.ticks.fromSeconds(0.001)
-
             # m5.instantiate() takes a parameter specifying the path to the
             # checkpoint directory. If the parameter is None, no checkpoint
             # will be restored.

From 7c83e3379b169f1dd6ed28ce1780602e88849ffd Mon Sep 17 00:00:00 2001
From: "Bobby R. Bruce" <bbruce@ucdavis.edu>
Date: Fri, 4 Oct 2024 11:56:48 -0700
Subject: [PATCH 07/47] stdlib: Add `_pre_instantiate` funcs for caches and
 memory

Note: At present these functions are unused, but they can be filled in
or overridden in subclasses as required.
---
 .../gem5/components/boards/abstract_board.py    |  4 +++-
 .../abstract_cache_hierarchy.py                 | 17 ++++++++++++++++-
 .../components/memory/abstract_memory_system.py | 13 +++++++++++++
 3 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/src/python/gem5/components/boards/abstract_board.py b/src/python/gem5/components/boards/abstract_board.py
index af9ff300b6..5819adc9de 100644
--- a/src/python/gem5/components/boards/abstract_board.py
+++ b/src/python/gem5/components/boards/abstract_board.py
@@ -420,8 +420,10 @@ class AbstractBoard:
         )
 
         # 3. Call any of the components' `_pre_instantiate` functions.
-        # Right now, only the processor requires this.
         self.get_processor()._pre_instantiate(root)
+        self.get_memory()._pre_instantiate(root)
+        if self.get_cache_hierarchy():
+            self.get_cache_hierarchy()._pre_instantiate(root)
 
         # 4. Return the root object.
         return root
diff --git a/src/python/gem5/components/cachehierarchies/abstract_cache_hierarchy.py b/src/python/gem5/components/cachehierarchies/abstract_cache_hierarchy.py
index b0435543af..dc20c14f70 100644
--- a/src/python/gem5/components/cachehierarchies/abstract_cache_hierarchy.py
+++ b/src/python/gem5/components/cachehierarchies/abstract_cache_hierarchy.py
@@ -42,7 +42,10 @@ from abc import (
 )
 from typing import Callable
 
-from m5.objects import SubSystem
+from m5.objects import (
+    Root,
+    SubSystem,
+)
 from m5.util.fdthelper import *
 
 from ..boards.abstract_board import AbstractBoard
@@ -139,6 +142,18 @@ class AbstractCacheHierarchy(SubSystem):
         """
         raise NotImplementedError
 
+    def _pre_instantiate(self, root: Root) -> None:
+        """Called in the `AbstractBoard`'s `_pre_instantiate` method. This is
+        called after `connect_things`, after the creation of the root object
+        (which is passed in as an argument), but before `m5.instantiate`).
+
+        Subclasses should override this method to set up any connections.
+
+        At present there is no general task that must be specified here and is
+        default or applicable to all cache hierarchies.
+        """
+        pass
+
     def _post_instantiate(self):
         """Called to set up anything needed after ``m5.instantiate``."""
         pass
diff --git a/src/python/gem5/components/memory/abstract_memory_system.py b/src/python/gem5/components/memory/abstract_memory_system.py
index 06fa60cad8..6d24e724b6 100644
--- a/src/python/gem5/components/memory/abstract_memory_system.py
+++ b/src/python/gem5/components/memory/abstract_memory_system.py
@@ -38,6 +38,7 @@ from m5.objects import (
     AddrRange,
     MemCtrl,
     Port,
+    Root,
     SubSystem,
 )
 
@@ -50,6 +51,18 @@ class AbstractMemorySystem(SubSystem):
     def __init__(self) -> None:
         super().__init__()
 
+    def _pre_instantiate(self, root: Root) -> None:
+        """Called in the `AbstractBoard`'s `_pre_instantiate` method. This is
+        called after `connect_things`, after the creation of the root object
+        (which is passed in as an argument), but before `m5.instantiate`).
+
+        Subclasses should override this method to set up any connections.
+
+        At present there is no general task that must be specified here and is
+        default or applicable to all memory systems.
+        """
+        pass
+
     @abstractmethod
     def incorporate_memory(self, board: AbstractBoard) -> None:
         """This function completes all of the necessary steps to add this

From 5db68114df439b2bf511bdb016ac69a140c92dab Mon Sep 17 00:00:00 2001
From: "Bobby R. Bruce" <bbruce@ucdavis.edu>
Date: Mon, 7 Oct 2024 00:50:18 -0700
Subject: [PATCH 08/47] misc,tests: Change Github Action caches to just be
 date-based

Hashing the `src` directory is too costly, with some runners reaching
timeout. Also, as we only have 10GB of cache it makes sense to have
more coarse-grained caching.
---
 .github/workflows/daily-tests.yaml  | 22 +++++++++++++++++-----
 .github/workflows/weekly-tests.yaml | 13 +++++++++++--
 2 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/daily-tests.yaml b/.github/workflows/daily-tests.yaml
index 54711ad63d..584cce0d90 100644
--- a/.github/workflows/daily-tests.yaml
+++ b/.github/workflows/daily-tests.yaml
@@ -8,6 +8,14 @@ on:
     workflow_dispatch:
 
 jobs:
+
+    get-date:
+        runs-on: ubuntu-latest
+        steps:
+            - name: Get the current date
+              id: date
+              run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_ENV
+
   # this builds both unittests.fast and unittests.debug
     unittests-fast-debug:
         strategy:
@@ -16,13 +24,14 @@ jobs:
         runs-on: [self-hosted, linux, x64]
         container: ghcr.io/gem5/ubuntu-24.04_all-dependencies:latest
         timeout-minutes: 60
+        needs: get-date
         steps:
             - uses: actions/checkout@v4
             - name: Cache build/ALL
               uses: actions/cache/restore@v4
               with:
                   path: build/ALL
-                  key: testlib-build-all-${{ hashFiles('src/**') }}
+                  key: testlib-build-all-${{ env.date }}
                   restore-keys: |
                       testlib-build-all
             - name: ALL/unittests.${{ matrix.type }} UnitTests
@@ -38,6 +47,7 @@ jobs:
         runs-on: [self-hosted, linux, x64]
         container: ghcr.io/gem5/ubuntu-24.04_all-dependencies:latest
         timeout-minutes: 1440 # 24 hours for entire matrix to run
+        needs: get-date
         steps:
             - name: Clean runner
               run: rm -rf ./* || true rm -rf ./.??* || true rm -rf ~/.cache || true
@@ -47,13 +57,13 @@ jobs:
               uses: actions/cache@v4
               with:
                   path: build/NULL
-                  key: testlib-build-null-${{ hashFiles('src/**') }}
+                  key: testlib-build-null-${{ env.date }}
 
             - name: Restore build/ALL cache
               uses: actions/cache@v4
               with:
                   path: build/ALL
-                  key: testlib-build-all-${{ hashFiles('src/**') }}
+                  key: testlib-build-all-${{ env.date }}
 
             - name: long ${{ matrix.test-type }} tests
               working-directory: ${{ github.workspace }}/tests
@@ -81,6 +91,7 @@ jobs:
                     gem5-library-example-arm-ubuntu-run-test-ALL-x86_64-opt, gem5-library-example-riscvmatched-hello-ALL-x86_64-opt]
         container: ghcr.io/gem5/ubuntu-24.04_all-dependencies:latest
         timeout-minutes: 1440 # 24 hours
+        needs: get-date
         steps:
             - name: Clean runner
               run: rm -rf ./* || true rm -rf ./.??* || true rm -rf ~/.cache || true
@@ -90,7 +101,7 @@ jobs:
               uses: actions/cache@v4
               with:
                   path: build/ALL
-                  key: testlib-build-all-${{ hashFiles('src/**') }}
+                  key: testlib-build-all-${{ env.date }}
                   restore-keys: |
                       testlib-build-all
 
@@ -113,6 +124,7 @@ jobs:
         runs-on: [self-hosted, linux, x64]
         container: ghcr.io/gem5/gcn-gpu:latest
         timeout-minutes: 720 # 12 hours
+        needs: get-date
 
         steps:
             - uses: actions/checkout@v4
@@ -123,7 +135,7 @@ jobs:
               uses: actions/cache@v4
               with:
                   path: build/VEGA_X86
-                  key: testlib-build-vega-${{ hashFiles('src/**') }}
+                  key: testlib-build-vega-${{ env.date }}
                   restore-keys: |
                       testlib-build-vega
 
diff --git a/.github/workflows/weekly-tests.yaml b/.github/workflows/weekly-tests.yaml
index 7ada70fddb..6baec1fa68 100644
--- a/.github/workflows/weekly-tests.yaml
+++ b/.github/workflows/weekly-tests.yaml
@@ -9,6 +9,13 @@ on:
 
 jobs:
 
+    get-date:
+        runs-on: ubuntu-latest
+        steps:
+            - name: Get the current date
+              id: date
+              run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_ENV
+
     # start running the very-long tests
     testlib-very-long-tests:
         strategy:
@@ -18,6 +25,7 @@ jobs:
         runs-on: [self-hosted, linux, x64]
         container: ghcr.io/gem5/ubuntu-24.04_all-dependencies:latest
         timeout-minutes: 4320 # 3 days
+        needs: get-date
         steps:
             - name: Clean runner
               run: rm -rf ./* || true rm -rf ./.??* || true rm -rf ~/.cache || true
@@ -27,7 +35,7 @@ jobs:
               uses: actions/cache@v4
               with:
                   path: build/ALL
-                  key: testlib-build-all-${{ hashFiles('src/**') }}
+                  key: testlib-build-all-${{ env.date }}
                   restore-keys: |
                       testlib-build-all
 
@@ -49,6 +57,7 @@ jobs:
         runs-on: [self-hosted, linux, x64]
         container: ghcr.io/gem5/gcn-gpu:latest
         timeout-minutes: 4320 # 3 days
+        needs: get-date
 
         steps:
             - uses: actions/checkout@v4
@@ -59,7 +68,7 @@ jobs:
               uses: actions/cache@v4
               with:
                   path: build/VEGA_X86
-                  key: testlib-build-vega-${{ hashFiles('src/**') }}
+                  key: testlib-build-vega-${{ env.date }}
                   restore-keys: |
                       testlib-build-vega
 

From f5858fe81f8537a708293ebc4d430da3549f5e2f Mon Sep 17 00:00:00 2001
From: Matthew Poremba <matthew.poremba@amd.com>
Date: Mon, 7 Oct 2024 07:12:07 -0700
Subject: [PATCH 09/47] dev-amdgpu: Deprecate rom and mmio trace params (#1633)

The ROM field was originally intended as a future alternate way to load
VBIOS without the ROM being on the disk image. This code path is never
taken for the devices gem5 supports and there is no gem5 implementation.
Deprecate the rom_binary field for this reason.

Similarly, MMIO traces were only used for Vega10. Deprecate this as
Vega10 is now deprecated. The MMIO trace reader is kept as it may still
be useful in the future. It is still the primary way to handle devices
which have graphics capability. None of the devices supported by gem5
have graphics now that Vega10 is deprecated.
---
 configs/example/gpufs/runfs.py         | 16 +---------------
 configs/example/gpufs/system/amdgpu.py |  2 --
 src/dev/amdgpu/AMDGPU.py               |  2 --
 src/dev/amdgpu/amdgpu_device.cc        | 10 ----------
 4 files changed, 1 insertion(+), 29 deletions(-)

diff --git a/configs/example/gpufs/runfs.py b/configs/example/gpufs/runfs.py
index eb95526509..8cb29b07ba 100644
--- a/configs/example/gpufs/runfs.py
+++ b/configs/example/gpufs/runfs.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021 Advanced Micro Devices, Inc.
+# Copyright (c) 2021-2024 Advanced Micro Devices, Inc.
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -82,10 +82,6 @@ def addRunFSOptions(parser):
         help="The second disk image to mount (/dev/sdb)",
     )
     parser.add_argument("--kernel", default=None, help="Linux kernel to boot")
-    parser.add_argument("--gpu-rom", default=None, help="GPU BIOS to load")
-    parser.add_argument(
-        "--gpu-mmio-trace", default=None, help="GPU MMIO trace to load"
-    )
     parser.add_argument(
         "--checkpoint-before-mmios",
         default=False,
@@ -241,16 +237,6 @@ def runGpuFSSystem(args):
         math.ceil(float(n_cu) / args.cu_per_scalar_cache)
     )
 
-    # Verify MMIO trace is valid. This is only needed for Vega10 simulations.
-    # The md5sum refers to the md5sum of the Vega10 MMIO hardware trace in
-    # the gem5-resources repository. By checking it here, we avoid potential
-    # errors that would cause the driver not to load and simulations to fail.
-    if args.gpu_device == "Vega10":
-        mmio_file = open(args.gpu_mmio_trace, "rb")
-        mmio_md5 = hashlib.md5(mmio_file.read()).hexdigest()
-        if mmio_md5 != "c4ff3326ae8a036e329b8b595c83bd6d":
-            m5.util.panic("MMIO file does not match gem5 resources")
-
     system = makeGpuFSSystem(args)
 
     root = Root(
diff --git a/configs/example/gpufs/system/amdgpu.py b/configs/example/gpufs/system/amdgpu.py
index dedbcc9324..bdeda9024a 100644
--- a/configs/example/gpufs/system/amdgpu.py
+++ b/configs/example/gpufs/system/amdgpu.py
@@ -176,8 +176,6 @@ def createGPU(system, args):
 def connectGPU(system, args):
     system.pc.south_bridge.gpu = AMDGPUDevice(pci_func=0, pci_dev=8, pci_bus=0)
 
-    system.pc.south_bridge.gpu.trace_file = args.gpu_mmio_trace
-    system.pc.south_bridge.gpu.rom_binary = args.gpu_rom
     system.pc.south_bridge.gpu.checkpoint_before_mmios = (
         args.checkpoint_before_mmios
     )
diff --git a/src/dev/amdgpu/AMDGPU.py b/src/dev/amdgpu/AMDGPU.py
index 35ffcfe528..b1f597aba8 100644
--- a/src/dev/amdgpu/AMDGPU.py
+++ b/src/dev/amdgpu/AMDGPU.py
@@ -81,8 +81,6 @@ class AMDGPUDevice(PciDevice):
     InterruptPin = 2
     ExpansionROM = 0
 
-    rom_binary = Param.String("ROM binary dumped from hardware")
-    trace_file = Param.String("MMIO trace collected on hardware")
     checkpoint_before_mmios = Param.Bool(
         False, "Take a checkpoint before the device begins sending MMIOs"
     )
diff --git a/src/dev/amdgpu/amdgpu_device.cc b/src/dev/amdgpu/amdgpu_device.cc
index c82d0de60c..50d152cda1 100644
--- a/src/dev/amdgpu/amdgpu_device.cc
+++ b/src/dev/amdgpu/amdgpu_device.cc
@@ -58,12 +58,6 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p)
       init_interrupt_count(0), _lastVMID(0),
       deviceMem(name() + ".deviceMem", p.memories, false, "", false)
 {
-    // Loading the rom binary dumped from hardware.
-    std::ifstream romBin;
-    romBin.open(p.rom_binary, std::ios::binary);
-    romBin.read((char *)rom.data(), ROM_SIZE);
-    romBin.close();
-
     // System pointer needs to be explicitly set for device memory since
     // DRAMCtrl uses it to get (1) cache line size and (2) the mem mode.
     // Note this means the cache line size is system wide.
@@ -92,10 +86,6 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p)
         panic("Unknown GPU device %s\n", p.device_name);
     }
 
-    if (p.trace_file != "") {
-        mmioReader.readMMIOTrace(p.trace_file);
-    }
-
     int sdma_id = 0;
     for (auto& s : p.sdmas) {
         s->setGPUDevice(this);

From 1ee924a0677b1917e705990f72668cbb4601ed38 Mon Sep 17 00:00:00 2001
From: "Erin (Jianghua) Le" <ejle@ucdavis.edu>
Date: Mon, 7 Oct 2024 13:45:03 -0700
Subject: [PATCH 10/47] python: clarify SimObject error message (#1625)

This adds more detail to the error message that is thrown when an orphan
node is instantiated.
---
 src/python/m5/SimObject.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/python/m5/SimObject.py b/src/python/m5/SimObject.py
index ce098bea7d..72bf692b6b 100644
--- a/src/python/m5/SimObject.py
+++ b/src/python/m5/SimObject.py
@@ -1259,7 +1259,9 @@ class SimObject(metaclass=MetaSimObject):
         if not self._ccObject:
             # Make sure this object is in the configuration hierarchy
             if not self._parent and not isRoot(self):
-                raise RuntimeError("Attempt to instantiate orphan node")
+                raise RuntimeError(
+                    f"Attempt to instantiate orphan node {self}"
+                )
             # Cycles in the configuration hierarchy are not supported. This
             # will catch the resulting recursion and stop.
             self._ccObject = -1

From 3fc21da13c70273e51744083c90a82b1cd428e23 Mon Sep 17 00:00:00 2001
From: "Bobby R. Bruce" <bbruce@ucdavis.edu>
Date: Mon, 7 Oct 2024 14:40:45 -0700
Subject: [PATCH 11/47] learning-gem5,tests: Update learning-gem5 Ruby Test ref
 (#1635)

The Daily tests have been failing as the learning-gem5 Ruby test now
exits at tick 9831 instead of tick 9981.

**Note**: The cause of this change is currently unknown. I'm not sure if
this is symptomatic of something bigger, but for now I only observe this
one failure, and this patch at least silences the error.
---
 tests/gem5/learning_gem5/ref/test | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/gem5/learning_gem5/ref/test b/tests/gem5/learning_gem5/ref/test
index 309ac2fa40..1e83a06f97 100644
--- a/tests/gem5/learning_gem5/ref/test
+++ b/tests/gem5/learning_gem5/ref/test
@@ -1,3 +1,3 @@
 Global frequency set at 1000000000 ticks per second
 Beginning simulation!
-Exiting @ tick 9981 because Ruby Tester completed
+Exiting @ tick 9831 because Ruby Tester completed

From 440999e447a19db88eeff9d4df4d8f7ba1ddf4e6 Mon Sep 17 00:00:00 2001
From: Giacomo Travaglini <giacomo.travaglini@arm.com>
Date: Tue, 8 Oct 2024 11:22:25 +0200
Subject: [PATCH 12/47] cpu-o3: Add Crypto OpDesc to the O3 Default FU (#1639)

There was a bug exposed by a recent PR [1] where until recently the O3
CPU was executing an instruction even if it did not have the required
functional unit in the FU pool.

We are adding the crypto descriptors to the Default FU pool in the O3
CPU so that no panic is encountered upon execution of a crypto
instruction.

[1]: https://github.com/gem5/gem5/pull/1516

Change-Id: Ifaf2f8e4780dfb8ba825a99a02dd587f011dbd23

Reviewed-by: Richard Cooper <richard.cooper@arm.com>

Signed-off-by: Giacomo Travaglini <giacomo.travaglini@arm.com>
---
 src/cpu/o3/FuncUnitConfig.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/cpu/o3/FuncUnitConfig.py b/src/cpu/o3/FuncUnitConfig.py
index ab01b4aa27..d60990fa3a 100644
--- a/src/cpu/o3/FuncUnitConfig.py
+++ b/src/cpu/o3/FuncUnitConfig.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2010, 2017, 2020 ARM Limited
+# Copyright (c) 2010, 2017, 2020, 2024 Arm Limited
 # All rights reserved.
 #
 # The license below extends only to copyright in the software and shall
@@ -109,6 +109,14 @@ class SIMD_Unit(FUDesc):
         OpDesc(opClass="SimdExt"),
         OpDesc(opClass="SimdFloatExt"),
         OpDesc(opClass="SimdConfig"),
+        OpDesc(opClass="SimdAes"),
+        OpDesc(opClass="SimdAesMix"),
+        OpDesc(opClass="SimdSha1Hash"),
+        OpDesc(opClass="SimdSha1Hash2"),
+        OpDesc(opClass="SimdSha256Hash"),
+        OpDesc(opClass="SimdSha256Hash2"),
+        OpDesc(opClass="SimdShaSigma2"),
+        OpDesc(opClass="SimdShaSigma3"),
     ]
     count = 4
 

From 4a3e2633d2d90d44b0fe7306f35774b865217fa5 Mon Sep 17 00:00:00 2001
From: Giacomo Travaglini <giacomo.travaglini@arm.com>
Date: Tue, 8 Oct 2024 11:23:14 +0200
Subject: [PATCH 13/47] cpu-o3: Add Matrix OpDesc to the O3 Default FU (#1640)

There was a bug exposed by a recent PR [1] where until recently the O3
CPU was executing an instruction even if it did not have the required
functional unit in the FU pool.

We are adding the matrix descriptors to the Default FU pool in the O3
CPU so that no panic is encountered upon execution of a matrix
instruction.

[1]: https://github.com/gem5/gem5/pull/1516

Change-Id: I04250255a2cbb2ee6f3ef204b62bc2c1ee2d4d2c

Reviewed-by: Richard Cooper <richard.cooper@arm.com>

Signed-off-by: Giacomo Travaglini <giacomo.travaglini@arm.com>
---
 src/cpu/o3/FUPool.py         | 3 ++-
 src/cpu/o3/FuncUnitConfig.py | 9 +++++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/src/cpu/o3/FUPool.py b/src/cpu/o3/FUPool.py
index 67f523787b..b82b450700 100644
--- a/src/cpu/o3/FUPool.py
+++ b/src/cpu/o3/FUPool.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017 ARM Limited
+# Copyright (c) 2017, 2024 Arm Limited
 # All rights reserved
 #
 # The license below extends only to copyright in the software and shall
@@ -57,6 +57,7 @@ class DefaultFUPool(FUPool):
         FP_MultDiv(),
         ReadPort(),
         SIMD_Unit(),
+        Matrix_Unit(),
         PredALU(),
         WritePort(),
         RdWrPort(),
diff --git a/src/cpu/o3/FuncUnitConfig.py b/src/cpu/o3/FuncUnitConfig.py
index d60990fa3a..5606046f5e 100644
--- a/src/cpu/o3/FuncUnitConfig.py
+++ b/src/cpu/o3/FuncUnitConfig.py
@@ -121,6 +121,15 @@ class SIMD_Unit(FUDesc):
     count = 4
 
 
+class Matrix_Unit(FUDesc):
+    opList = [
+        OpDesc(opClass="Matrix"),
+        OpDesc(opClass="MatrixMov"),
+        OpDesc(opClass="MatrixOP"),
+    ]
+    count = 1
+
+
 class PredALU(FUDesc):
     opList = [OpDesc(opClass="SimdPredAlu")]
     count = 1

From 4f7b3ed82741a6adc198d1b0cf818f6fa2c93bde Mon Sep 17 00:00:00 2001
From: Matthew Poremba <matthew.poremba@amd.com>
Date: Tue, 8 Oct 2024 08:14:50 -0700
Subject: [PATCH 14/47] mem-ruby: Remove static methods from RubySystem (#1453)

There are several parts to this PR to work towards #1349 .

(1) Make RubySystem::getBlockSizeBytes non-static by providing ways to
access the block size or passing the block size explicitly to classes.

The main changes are:
 - DataBlocks must be explicitly allocated. A default ctor still exists
   to avoid needing to heavily modify SLICC. The size can be set using a
   realloc function, operator=, or copy ctor. This is handled completely
   transparently meaning no protocol or config changes are required.
 - WriteMask now requires block size to be set. This is also handled
   transparently by modifying the SLICC parser to identify WriteMask
   types and call setBlockSize().
 - AbstractCacheEntry and TBE classes now require block size to be set.
   This is handled transparently by modifying the SLICC parser to
   identify these classes and call initBlockSize() which calls
   setBlockSize() for any DataBlock or WriteMask.
 - All AbstractControllers now have a pointer to RubySystem. This is
   assigned in SLICC generated code and requires no changes to protocol
   or configs.
 - The Ruby Message class now requires block size in all constructors.
   This is added to the argument list automatically by the SLICC parser.

(2) Relax dependence on common functions in
src/mem/ruby/common/Address.hh
so that RubySystem::getBlockSizeBits is no longer static. Many classes
already have a way to get block size from the previous commit, so they
simply multiply by 8 to get the number of bits. For handling SLICC and
reducing the number of changes, define makeCacheLine, getOffset, etc. in
RubyPort and AbstractController. The only protocol changes required are
to change any "RubySystem::foo()" calls with "m_ruby_system->foo()".

For classes which do not have a way to get access to block size but
still used makeLineAddress, getOffset, etc., the block size must be
passed to that class. This requires some changes to the SimObject
interface for two commonly used classes: DirectoryMemory and
RubyPrefetcher, resulting in user-facing API changes.

User-facing API changes:
 - DirectoryMemory and RubyPrefetcher now require the cache line size as
   a non-optional argument.
 - RubySequencer SimObjects now require RubySystem as a non-optional
   argument.
 - TesterThread in the GPU ruby tester now requires the cache line size
   as a non-optional argument.

(3) Remove static member variables in RubySystem which control
randomization, cooldown, and warmup. These are mostly used by the Ruby
Network. The network classes are modified to take these former static
variables as parameters which are passed to the corresponding method
(e.g., enqueue, delayHead, etc.) rather than needing a RubySystem object
at all.

Change-Id: Ia63c2ad5cf0bf9d1cbdffba5d3a679bb4d3b1220

(4) There are two major groups of SLICC-generated static methods:
getNumControllers() on each cache controller, which returns the number
of controllers created by the configs at run time, and the functions
which access this method, namely MachineType_base_count and
MachineType_base_number. These need to be removed in order to create
multiple RubySystem objects; otherwise NetDest, version values, and
other objects are incorrect.

To remove the static requirement, MachineType_base_count and
MachineType_base_number are moved to RubySystem. Any class which needs
to call these methods must now have a pointer to a RubySystem. To enable
that, several changes are made:
 - RubyRequest and Message now require a RubySystem pointer in the
   constructor. The pointer is passed to fields in the Message class
   which require a RubySystem pointer (e.g., NetDest). SLICC is modified
   to do this automatically.
 - SLICC structures may now optionally take an "implicit constructor"
   which can be used to call a non-default constructor for locally
   defined variables (e.g., temporary variables within SLICC actions). A
   statement such as "NetDest bcast_dest;" in SLICC will implicitly
   append a call to the NetDest constructor taking RubySystem, for
   example.
 - RubySystem gets passed to Ruby network objects (Network, Topology).
---
 configs/example/ruby_gpu_random_test.py       |   2 +
 configs/learning_gem5/part3/msi_caches.py     |   5 +-
 .../part3/ruby_caches_MI_example.py           |   5 +-
 configs/learning_gem5/part3/test_caches.py    |   1 +
 configs/ruby/AMD_Base_Constructor.py          |   4 +-
 configs/ruby/GPU_VIPER.py                     |  18 +--
 configs/ruby/MESI_Three_Level.py              |   1 +
 configs/ruby/MESI_Three_Level_HTM.py          |   1 +
 configs/ruby/MESI_Two_Level.py                |   2 +-
 configs/ruby/MOESI_AMD_Base.py                |   8 +-
 configs/ruby/Ruby.py                          |   8 +-
 src/cpu/testers/gpu_ruby_test/TesterThread.py |   1 +
 .../testers/gpu_ruby_test/address_manager.cc  |   4 +-
 src/cpu/testers/gpu_ruby_test/dma_thread.cc   |   6 +-
 .../testers/gpu_ruby_test/gpu_wavefront.cc    |   8 +-
 .../testers/gpu_ruby_test/tester_thread.cc    |  13 +-
 .../testers/gpu_ruby_test/tester_thread.hh    |   2 +
 src/cpu/testers/rubytest/Check.cc             |  18 ++-
 src/cpu/testers/rubytest/Check.hh             |   1 +
 src/cpu/testers/rubytest/RubyTester.hh        |   4 +-
 src/mem/ruby/common/Address.cc                |  22 ++--
 src/mem/ruby/common/Address.hh                |   8 +-
 src/mem/ruby/common/DataBlock.cc              |  91 ++++++++++---
 src/mem/ruby/common/DataBlock.hh              |  21 ++-
 src/mem/ruby/common/NetDest.cc                |  57 +++++++-
 src/mem/ruby/common/NetDest.hh                |  11 ++
 src/mem/ruby/common/SubBlock.cc               |   7 +-
 src/mem/ruby/common/SubBlock.hh               |   3 +-
 src/mem/ruby/common/WriteMask.cc              |   5 +-
 src/mem/ruby/common/WriteMask.hh              |  24 ++++
 src/mem/ruby/network/MessageBuffer.cc         |  12 +-
 src/mem/ruby/network/MessageBuffer.hh         |   9 +-
 src/mem/ruby/network/Network.cc               |  26 +++-
 src/mem/ruby/network/Network.hh               |  11 ++
 src/mem/ruby/network/Topology.cc              |  27 ++--
 src/mem/ruby/network/Topology.hh              |   7 +-
 .../ruby/network/garnet/NetworkInterface.cc   |  15 ++-
 .../ruby/network/garnet/NetworkInterface.hh   |   2 +
 src/mem/ruby/network/simple/PerfectSwitch.cc  |   3 +-
 src/mem/ruby/network/simple/Switch.hh         |   1 +
 src/mem/ruby/network/simple/Throttle.cc       |   4 +-
 src/mem/ruby/profiler/AddressProfiler.cc      |   4 +-
 src/mem/ruby/protocol/GPU_VIPER-SQC.sm        |   2 +-
 src/mem/ruby/protocol/GPU_VIPER-TCP.sm        |   2 +-
 .../ruby/protocol/MESI_Three_Level-L1cache.sm |   2 +-
 .../ruby/protocol/MESI_Two_Level-L1cache.sm   |   2 +-
 .../protocol/MOESI_AMD_Base-RegionBuffer.sm   |   6 +-
 .../ruby/protocol/MOESI_AMD_Base-RegionDir.sm |   6 +-
 src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm   |   2 +-
 .../protocol/MOESI_AMD_Base-probeFilter.sm    |   2 +-
 .../ruby/protocol/MOESI_CMP_directory-dir.sm  |   2 +-
 .../ruby/protocol/MOESI_CMP_token-L1cache.sm  |   2 +-
 src/mem/ruby/protocol/MOESI_CMP_token-dir.sm  |   2 +-
 src/mem/ruby/protocol/RubySlicc_Exports.sm    |   2 +
 src/mem/ruby/protocol/RubySlicc_MemControl.sm |   4 +-
 src/mem/ruby/protocol/RubySlicc_Types.sm      |   2 +-
 src/mem/ruby/protocol/RubySlicc_Util.sm       |   1 +
 src/mem/ruby/protocol/chi/CHI-cache.sm        |   2 +-
 .../ruby/protocol/chi/CHI-dvm-misc-node.sm    |   2 +-
 src/mem/ruby/protocol/chi/CHI-mem.sm          |   2 +-
 .../slicc_interface/AbstractCacheEntry.hh     |  11 +-
 .../slicc_interface/AbstractController.cc     |  56 +++++++-
 .../slicc_interface/AbstractController.hh     |  13 ++
 src/mem/ruby/slicc_interface/Message.hh       |  14 +-
 src/mem/ruby/slicc_interface/RubyRequest.hh   |  41 ++++--
 .../RubySlicc_ComponentMapping.hh             |  17 ---
 .../ruby/slicc_interface/RubySlicc_Util.hh    |  16 ++-
 src/mem/ruby/structures/ALUFreeListArray.cc   |  10 +-
 src/mem/ruby/structures/ALUFreeListArray.hh   |  27 +++-
 src/mem/ruby/structures/BankedArray.cc        |   7 +-
 src/mem/ruby/structures/BankedArray.hh        |   5 +-
 src/mem/ruby/structures/CacheMemory.cc        |  31 +++--
 src/mem/ruby/structures/CacheMemory.hh        |  10 ++
 src/mem/ruby/structures/DirectoryMemory.cc    |   8 +-
 src/mem/ruby/structures/DirectoryMemory.hh    |   3 +
 src/mem/ruby/structures/DirectoryMemory.py    |   4 +
 src/mem/ruby/structures/PerfectCacheMemory.hh |  25 ++--
 src/mem/ruby/structures/PersistentTable.hh    |  14 ++
 src/mem/ruby/structures/RubyCache.py          |   1 -
 src/mem/ruby/structures/RubyPrefetcher.cc     |  13 +-
 src/mem/ruby/structures/RubyPrefetcher.hh     |  14 +-
 src/mem/ruby/structures/RubyPrefetcher.py     |   3 +
 .../ruby/structures/RubyPrefetcherProxy.cc    |  25 +++-
 .../ruby/structures/RubyPrefetcherProxy.hh    |   3 +
 src/mem/ruby/structures/TBETable.hh           |  10 +-
 src/mem/ruby/structures/TimerTable.cc         |   4 +-
 src/mem/ruby/structures/TimerTable.hh         |   8 ++
 src/mem/ruby/structures/WireBuffer.cc         |   4 +-
 src/mem/ruby/structures/WireBuffer.hh         |   5 +-
 src/mem/ruby/structures/WireBuffer.py         |   2 -
 src/mem/ruby/system/CacheRecorder.cc          |  24 ++--
 src/mem/ruby/system/CacheRecorder.hh          |  10 +-
 src/mem/ruby/system/DMASequencer.cc           |  24 ++--
 src/mem/ruby/system/GPUCoalescer.cc           |  20 +--
 src/mem/ruby/system/GPUCoalescer.hh           |   2 +
 src/mem/ruby/system/RubyPort.cc               |  31 ++++-
 src/mem/ruby/system/RubyPort.hh               |   5 +
 src/mem/ruby/system/RubySystem.cc             |  25 +---
 src/mem/ruby/system/RubySystem.hh             |  30 +++--
 src/mem/ruby/system/Sequencer.cc              |  43 +++---
 src/mem/ruby/system/Sequencer.hh              |   2 +
 src/mem/ruby/system/Sequencer.py              |   2 +-
 src/mem/ruby/system/VIPERCoalescer.cc         |  38 +++---
 src/mem/ruby/system/VIPERSequencer.cc         |   4 +-
 src/mem/slicc/ast/CheckProbeStatementAST.py   |   3 +-
 .../slicc/ast/DeferEnqueueingStatementAST.py  |   3 +-
 src/mem/slicc/ast/EnqueueStatementAST.py      |  13 +-
 src/mem/slicc/ast/LocalVariableAST.py         |   2 +
 src/mem/slicc/ast/PeekStatementAST.py         |   3 +-
 src/mem/slicc/symbols/StateMachine.py         |  49 ++++---
 src/mem/slicc/symbols/Type.py                 | 124 ++++++++++++++++--
 .../chi/private_l1_cache_hierarchy.py         |  16 ++-
 .../ruby/caches/mesi_three_level/directory.py |   2 +-
 .../ruby/caches/mesi_three_level/l1_cache.py  |   2 +-
 .../ruby/caches/mesi_three_level/l2_cache.py  |   2 +-
 .../ruby/caches/mesi_two_level/directory.py   |   2 +-
 .../ruby/caches/mesi_two_level/l1_cache.py    |   2 +-
 .../ruby/caches/mi_example/directory.py       |   2 +-
 .../prebuilt/octopi_cache/core_complex.py     |   1 +
 .../caches/prebuilt/octopi_cache/octopi.py    |  10 +-
 .../ruby/mesi_three_level_cache_hierarchy.py  |  12 +-
 .../ruby/mesi_two_level_cache_hierarchy.py    |  15 ++-
 .../ruby/mi_example_cache_hierarchy.py        |  11 +-
 123 files changed, 1066 insertions(+), 399 deletions(-)

diff --git a/configs/example/ruby_gpu_random_test.py b/configs/example/ruby_gpu_random_test.py
index bfcd2c953d..eb7dd3acbd 100644
--- a/configs/example/ruby_gpu_random_test.py
+++ b/configs/example/ruby_gpu_random_test.py
@@ -371,6 +371,7 @@ for dma_idx in range(n_DMAs):
             num_lanes=1,
             clk_domain=thread_clock,
             deadlock_threshold=tester_deadlock_threshold,
+            cache_line_size=system.cache_line_size,
         )
     )
     g_thread_idx += 1
@@ -393,6 +394,7 @@ for cu_idx in range(n_CUs):
                 num_lanes=args.wf_size,
                 clk_domain=thread_clock,
                 deadlock_threshold=tester_deadlock_threshold,
+                cache_line_size=system.cache_line_size,
             )
         )
         g_thread_idx += 1
diff --git a/configs/learning_gem5/part3/msi_caches.py b/configs/learning_gem5/part3/msi_caches.py
index c198662c5e..b719c7ab60 100644
--- a/configs/learning_gem5/part3/msi_caches.py
+++ b/configs/learning_gem5/part3/msi_caches.py
@@ -84,6 +84,7 @@ class MyCacheSystem(RubySystem):
                 # I/D cache is combined and grab from ctrl
                 dcache=self.controllers[i].cacheMemory,
                 clk_domain=self.controllers[i].clk_domain,
+                ruby_system=self,
             )
             for i in range(len(cpus))
         ]
@@ -191,7 +192,9 @@ class DirController(Directory_Controller):
         self.version = self.versionCount()
         self.addr_ranges = ranges
         self.ruby_system = ruby_system
-        self.directory = RubyDirectoryMemory()
+        self.directory = RubyDirectoryMemory(
+            block_size=ruby_system.block_size_bytes
+        )
         # Connect this directory to the memory side.
         self.memory = mem_ctrls[0].port
         self.connectQueues(ruby_system)
diff --git a/configs/learning_gem5/part3/ruby_caches_MI_example.py b/configs/learning_gem5/part3/ruby_caches_MI_example.py
index baee120bb9..583041a674 100644
--- a/configs/learning_gem5/part3/ruby_caches_MI_example.py
+++ b/configs/learning_gem5/part3/ruby_caches_MI_example.py
@@ -84,6 +84,7 @@ class MyCacheSystem(RubySystem):
                 # I/D cache is combined and grab from ctrl
                 dcache=self.controllers[i].cacheMemory,
                 clk_domain=self.controllers[i].clk_domain,
+                ruby_system=self,
             )
             for i in range(len(cpus))
         ]
@@ -180,7 +181,9 @@ class DirController(Directory_Controller):
         self.version = self.versionCount()
         self.addr_ranges = ranges
         self.ruby_system = ruby_system
-        self.directory = RubyDirectoryMemory()
+        self.directory = RubyDirectoryMemory(
+            block_size=ruby_system.block_size_bytes
+        )
         # Connect this directory to the memory side.
         self.memory = mem_ctrls[0].port
         self.connectQueues(ruby_system)
diff --git a/configs/learning_gem5/part3/test_caches.py b/configs/learning_gem5/part3/test_caches.py
index 4e8e8febda..be2d46253e 100644
--- a/configs/learning_gem5/part3/test_caches.py
+++ b/configs/learning_gem5/part3/test_caches.py
@@ -79,6 +79,7 @@ class TestCacheSystem(RubySystem):
                 # I/D cache is combined and grab from ctrl
                 dcache=self.controllers[i].cacheMemory,
                 clk_domain=self.clk_domain,
+                ruby_system=self,
             )
             for i in range(num_testers)
         ]
diff --git a/configs/ruby/AMD_Base_Constructor.py b/configs/ruby/AMD_Base_Constructor.py
index ff4246a7e0..7d40862517 100644
--- a/configs/ruby/AMD_Base_Constructor.py
+++ b/configs/ruby/AMD_Base_Constructor.py
@@ -84,14 +84,14 @@ class CPCntrl(AMD_Base_Controller, CntrlBase):
         self.L2cache = L2Cache()
         self.L2cache.create(options.l2_size, options.l2_assoc, options)
 
-        self.sequencer = RubySequencer()
+        self.sequencer = RubySequencer(ruby_system=ruby_system)
         self.sequencer.version = self.seqCount()
         self.sequencer.dcache = self.L1D0cache
         self.sequencer.ruby_system = ruby_system
         self.sequencer.coreid = 0
         self.sequencer.is_cpu_sequencer = True
 
-        self.sequencer1 = RubySequencer()
+        self.sequencer1 = RubySequencer(ruby_system=ruby_system)
         self.sequencer1.version = self.seqCount()
         self.sequencer1.dcache = self.L1D1cache
         self.sequencer1.ruby_system = ruby_system
diff --git a/configs/ruby/GPU_VIPER.py b/configs/ruby/GPU_VIPER.py
index 313d1d514a..15108bb674 100644
--- a/configs/ruby/GPU_VIPER.py
+++ b/configs/ruby/GPU_VIPER.py
@@ -114,14 +114,14 @@ class CPCntrl(CorePair_Controller, CntrlBase):
         self.L2cache = L2Cache()
         self.L2cache.create(options.l2_size, options.l2_assoc, options)
 
-        self.sequencer = RubySequencer()
+        self.sequencer = RubySequencer(ruby_system=ruby_system)
         self.sequencer.version = self.seqCount()
         self.sequencer.dcache = self.L1D0cache
         self.sequencer.ruby_system = ruby_system
         self.sequencer.coreid = 0
         self.sequencer.is_cpu_sequencer = True
 
-        self.sequencer1 = RubySequencer()
+        self.sequencer1 = RubySequencer(ruby_system=ruby_system)
         self.sequencer1.version = self.seqCount()
         self.sequencer1.dcache = self.L1D1cache
         self.sequencer1.ruby_system = ruby_system
@@ -169,7 +169,7 @@ class TCPCntrl(TCP_Controller, CntrlBase):
         # TCP_Controller inherits this from RubyController
         self.mandatory_queue_latency = options.mandatory_queue_latency
 
-        self.coalescer = VIPERCoalescer()
+        self.coalescer = VIPERCoalescer(ruby_system=ruby_system)
         self.coalescer.version = self.seqCount()
         self.coalescer.icache = self.L1cache
         self.coalescer.dcache = self.L1cache
@@ -182,7 +182,7 @@ class TCPCntrl(TCP_Controller, CntrlBase):
             options.max_coalesces_per_cycle
         )
 
-        self.sequencer = RubySequencer()
+        self.sequencer = RubySequencer(ruby_system=ruby_system)
         self.sequencer.version = self.seqCount()
         self.sequencer.dcache = self.L1cache
         self.sequencer.ruby_system = ruby_system
@@ -211,7 +211,7 @@ class TCPCntrl(TCP_Controller, CntrlBase):
         self.L1cache.create(options)
         self.issue_latency = 1
 
-        self.coalescer = VIPERCoalescer()
+        self.coalescer = VIPERCoalescer(ruby_system=ruby_system)
         self.coalescer.version = self.seqCount()
         self.coalescer.icache = self.L1cache
         self.coalescer.dcache = self.L1cache
@@ -219,7 +219,7 @@ class TCPCntrl(TCP_Controller, CntrlBase):
         self.coalescer.support_inst_reqs = False
         self.coalescer.is_cpu_sequencer = False
 
-        self.sequencer = RubySequencer()
+        self.sequencer = RubySequencer(ruby_system=ruby_system)
         self.sequencer.version = self.seqCount()
         self.sequencer.dcache = self.L1cache
         self.sequencer.ruby_system = ruby_system
@@ -387,7 +387,9 @@ class DirCntrl(Directory_Controller, CntrlBase):
         self.response_latency = 30
 
         self.addr_ranges = dir_ranges
-        self.directory = RubyDirectoryMemory()
+        self.directory = RubyDirectoryMemory(
+            block_size=ruby_system.block_size_bytes
+        )
 
         self.L3CacheMemory = L3Cache()
         self.L3CacheMemory.create(options, ruby_system, system)
@@ -686,7 +688,7 @@ def construct_gpudirs(options, system, ruby_system, network):
         dir_cntrl.addr_ranges = dram_intf.range
 
         # Append
-        exec("system.ruby.gpu_dir_cntrl%d = dir_cntrl" % i)
+        exec("ruby_system.gpu_dir_cntrl%d = dir_cntrl" % i)
         dir_cntrl_nodes.append(dir_cntrl)
         mem_ctrls.append(mem_ctrl)
 
diff --git a/configs/ruby/MESI_Three_Level.py b/configs/ruby/MESI_Three_Level.py
index e0de4e0636..9054fefc01 100644
--- a/configs/ruby/MESI_Three_Level.py
+++ b/configs/ruby/MESI_Three_Level.py
@@ -148,6 +148,7 @@ def create_system(
                 train_misses=5,
                 num_startup_pfs=4,
                 cross_page=True,
+                block_size=options.cacheline_size,
             )
 
             l0_cntrl = L0Cache_Controller(
diff --git a/configs/ruby/MESI_Three_Level_HTM.py b/configs/ruby/MESI_Three_Level_HTM.py
index e6c4e81f91..d7ad3bdc04 100644
--- a/configs/ruby/MESI_Three_Level_HTM.py
+++ b/configs/ruby/MESI_Three_Level_HTM.py
@@ -148,6 +148,7 @@ def create_system(
                 train_misses=5,
                 num_startup_pfs=4,
                 cross_page=True,
+                block_size=options.cacheline_size,
             )
 
             l0_cntrl = L0Cache_Controller(
diff --git a/configs/ruby/MESI_Two_Level.py b/configs/ruby/MESI_Two_Level.py
index 500afbc199..6e1e0b97f3 100644
--- a/configs/ruby/MESI_Two_Level.py
+++ b/configs/ruby/MESI_Two_Level.py
@@ -94,7 +94,7 @@ def create_system(
             is_icache=False,
         )
 
-        prefetcher = RubyPrefetcher()
+        prefetcher = RubyPrefetcher(block_size=options.cacheline_size)
 
         clk_domain = cpus[i].clk_domain
 
diff --git a/configs/ruby/MOESI_AMD_Base.py b/configs/ruby/MOESI_AMD_Base.py
index aeab96a85f..1095defc57 100644
--- a/configs/ruby/MOESI_AMD_Base.py
+++ b/configs/ruby/MOESI_AMD_Base.py
@@ -112,14 +112,14 @@ class CPCntrl(CorePair_Controller, CntrlBase):
         self.L2cache = L2Cache()
         self.L2cache.create(options)
 
-        self.sequencer = RubySequencer()
+        self.sequencer = RubySequencer(ruby_system=ruby_system)
         self.sequencer.version = self.seqCount()
         self.sequencer.dcache = self.L1D0cache
         self.sequencer.ruby_system = ruby_system
         self.sequencer.coreid = 0
         self.sequencer.is_cpu_sequencer = True
 
-        self.sequencer1 = RubySequencer()
+        self.sequencer1 = RubySequencer(ruby_system=ruby_system)
         self.sequencer1.version = self.seqCount()
         self.sequencer1.dcache = self.L1D1cache
         self.sequencer1.ruby_system = ruby_system
@@ -194,7 +194,9 @@ class DirCntrl(Directory_Controller, CntrlBase):
         self.response_latency = 30
 
         self.addr_ranges = dir_ranges
-        self.directory = RubyDirectoryMemory()
+        self.directory = RubyDirectoryMemory(
+            block_size=ruby_system.block_size_bytes
+        )
 
         self.L3CacheMemory = L3Cache()
         self.L3CacheMemory.create(options, ruby_system, system)
diff --git a/configs/ruby/Ruby.py b/configs/ruby/Ruby.py
index e427a39de8..0a6671aa4b 100644
--- a/configs/ruby/Ruby.py
+++ b/configs/ruby/Ruby.py
@@ -308,7 +308,9 @@ def create_directories(options, bootmem, ruby_system, system):
     for i in range(options.num_dirs):
         dir_cntrl = Directory_Controller()
         dir_cntrl.version = i
-        dir_cntrl.directory = RubyDirectoryMemory()
+        dir_cntrl.directory = RubyDirectoryMemory(
+            block_size=ruby_system.block_size_bytes
+        )
         dir_cntrl.ruby_system = ruby_system
 
         exec("ruby_system.dir_cntrl%d = dir_cntrl" % i)
@@ -316,7 +318,9 @@ def create_directories(options, bootmem, ruby_system, system):
 
     if bootmem is not None:
         rom_dir_cntrl = Directory_Controller()
-        rom_dir_cntrl.directory = RubyDirectoryMemory()
+        rom_dir_cntrl.directory = RubyDirectoryMemory(
+            block_size=ruby_system.block_size_bytes
+        )
         rom_dir_cntrl.ruby_system = ruby_system
         rom_dir_cntrl.version = i + 1
         rom_dir_cntrl.memory = bootmem.port
diff --git a/src/cpu/testers/gpu_ruby_test/TesterThread.py b/src/cpu/testers/gpu_ruby_test/TesterThread.py
index 49388a76e1..6ddfc66ddc 100644
--- a/src/cpu/testers/gpu_ruby_test/TesterThread.py
+++ b/src/cpu/testers/gpu_ruby_test/TesterThread.py
@@ -41,3 +41,4 @@ class TesterThread(ClockedObject):
     thread_id = Param.Int("Unique TesterThread ID")
     num_lanes = Param.Int("Number of lanes this thread has")
     deadlock_threshold = Param.Cycles(1000000000, "Deadlock threshold")
+    cache_line_size = Param.UInt32("Size of cache line in cache")
diff --git a/src/cpu/testers/gpu_ruby_test/address_manager.cc b/src/cpu/testers/gpu_ruby_test/address_manager.cc
index a0c0670a8f..83d8a1a277 100644
--- a/src/cpu/testers/gpu_ruby_test/address_manager.cc
+++ b/src/cpu/testers/gpu_ruby_test/address_manager.cc
@@ -64,7 +64,9 @@ AddressManager::AddressManager(int n_atomic_locs, int n_normal_locs_per_atomic)
     std::shuffle(
         randAddressMap.begin(),
         randAddressMap.end(),
-        std::default_random_engine(random_mt.random<unsigned>(0,UINT_MAX))
+        // TODO: This is a bug unrelated to this draft PR but the GPU tester is
+        // useful for testing this PR.
+        std::default_random_engine(random_mt.random<unsigned>(0,UINT_MAX-1))
     );
 
     // initialize atomic locations
diff --git a/src/cpu/testers/gpu_ruby_test/dma_thread.cc b/src/cpu/testers/gpu_ruby_test/dma_thread.cc
index 1d6f46c44b..2c4c610c51 100644
--- a/src/cpu/testers/gpu_ruby_test/dma_thread.cc
+++ b/src/cpu/testers/gpu_ruby_test/dma_thread.cc
@@ -70,7 +70,7 @@ DmaThread::issueLoadOps()
         Addr address = addrManager->getAddress(location);
         DPRINTF(ProtocolTest, "%s Episode %d: Issuing Load - Addr %s\n",
                 this->getName(), curEpisode->getEpisodeId(),
-                ruby::printAddress(address));
+                printAddress(address));
 
         int load_size = sizeof(Value);
 
@@ -127,7 +127,7 @@ DmaThread::issueStoreOps()
 
         DPRINTF(ProtocolTest, "%s Episode %d: Issuing Store - Addr %s - "
                 "Value %d\n", this->getName(),
-                curEpisode->getEpisodeId(), ruby::printAddress(address),
+                curEpisode->getEpisodeId(), printAddress(address),
                 new_value);
 
         auto req = std::make_shared<Request>(address, sizeof(Value),
@@ -211,7 +211,7 @@ DmaThread::hitCallback(PacketPtr pkt)
 
     DPRINTF(ProtocolTest, "%s Episode %d: hitCallback - Command %s -"
             " Addr %s\n", this->getName(), curEpisode->getEpisodeId(),
-            resp_cmd.toString(), ruby::printAddress(addr));
+            resp_cmd.toString(), printAddress(addr));
 
     if (resp_cmd == MemCmd::SwapResp) {
         // response to a pending atomic
diff --git a/src/cpu/testers/gpu_ruby_test/gpu_wavefront.cc b/src/cpu/testers/gpu_ruby_test/gpu_wavefront.cc
index ae4078ee6c..516e77ddae 100644
--- a/src/cpu/testers/gpu_ruby_test/gpu_wavefront.cc
+++ b/src/cpu/testers/gpu_ruby_test/gpu_wavefront.cc
@@ -67,7 +67,7 @@ GpuWavefront::issueLoadOps()
             Addr address = addrManager->getAddress(location);
             DPRINTF(ProtocolTest, "%s Episode %d: Issuing Load - Addr %s\n",
                     this->getName(), curEpisode->getEpisodeId(),
-                    ruby::printAddress(address));
+                    printAddress(address));
 
             int load_size = sizeof(Value);
 
@@ -124,7 +124,7 @@ GpuWavefront::issueStoreOps()
 
             DPRINTF(ProtocolTest, "%s Episode %d: Issuing Store - Addr %s - "
                     "Value %d\n", this->getName(),
-                    curEpisode->getEpisodeId(), ruby::printAddress(address),
+                    curEpisode->getEpisodeId(), printAddress(address),
                     new_value);
 
             auto req = std::make_shared<Request>(address, sizeof(Value),
@@ -178,7 +178,7 @@ GpuWavefront::issueAtomicOps()
 
         DPRINTF(ProtocolTest, "%s Episode %d: Issuing Atomic_Inc - Addr %s\n",
                 this->getName(), curEpisode->getEpisodeId(),
-                ruby::printAddress(address));
+                printAddress(address));
 
         // must be aligned with store size
         assert(address % sizeof(Value) == 0);
@@ -268,7 +268,7 @@ GpuWavefront::hitCallback(PacketPtr pkt)
     DPRINTF(ProtocolTest, "%s Episode %d: hitCallback - Command %s - "
                     "Addr %s\n", this->getName(),
                     curEpisode->getEpisodeId(), resp_cmd.toString(),
-                    ruby::printAddress(addr));
+                    printAddress(addr));
 
     // whether the transaction is done after this hitCallback
     bool isTransactionDone = true;
diff --git a/src/cpu/testers/gpu_ruby_test/tester_thread.cc b/src/cpu/testers/gpu_ruby_test/tester_thread.cc
index ce3a1bccc6..dbcfba8c3c 100644
--- a/src/cpu/testers/gpu_ruby_test/tester_thread.cc
+++ b/src/cpu/testers/gpu_ruby_test/tester_thread.cc
@@ -43,6 +43,7 @@ TesterThread::TesterThread(const Params &p)
       : ClockedObject(p),
         threadEvent(this, "TesterThread tick"),
         deadlockCheckEvent(this),
+        cacheLineSize(p.cache_line_size),
         threadId(p.thread_id),
         numLanes(p.num_lanes),
         tester(nullptr), addrManager(nullptr), port(nullptr),
@@ -383,7 +384,7 @@ TesterThread::validateAtomicResp(Location loc, int lane, Value ret_val)
         ss << threadName << ": Atomic Op returned unexpected value\n"
            << "\tEpisode " << curEpisode->getEpisodeId() << "\n"
            << "\tLane ID " << lane << "\n"
-           << "\tAddress " << ruby::printAddress(addr) << "\n"
+           << "\tAddress " << printAddress(addr) << "\n"
            << "\tAtomic Op's return value " << ret_val << "\n";
 
         // print out basic info
@@ -409,7 +410,7 @@ TesterThread::validateLoadResp(Location loc, int lane, Value ret_val)
            << "\tTesterThread " << threadId << "\n"
            << "\tEpisode " << curEpisode->getEpisodeId() << "\n"
            << "\tLane ID " << lane << "\n"
-           << "\tAddress " << ruby::printAddress(addr) << "\n"
+           << "\tAddress " << printAddress(addr) << "\n"
            << "\tLoaded value " << ret_val << "\n"
            << "\tLast writer " << addrManager->printLastWriter(loc) << "\n";
 
@@ -467,7 +468,7 @@ TesterThread::printOutstandingReqs(const OutstandingReqTable& table,
 
     for (const auto& m : table) {
         for (const auto& req : m.second) {
-            ss << "\t\t\tAddr " << ruby::printAddress(m.first)
+            ss << "\t\t\tAddr " << printAddress(m.first)
                << ": delta (curCycle - issueCycle) = "
                << (cur_cycle - req.issueCycle) << std::endl;
         }
@@ -488,4 +489,10 @@ TesterThread::printAllOutstandingReqs(std::stringstream& ss) const
        << pendingFenceCount << std::endl;
 }
 
+std::string
+TesterThread::printAddress(Addr addr) const
+{
+    return ruby::printAddress(addr, cacheLineSize * 8);
+}
+
 } // namespace gem5
diff --git a/src/cpu/testers/gpu_ruby_test/tester_thread.hh b/src/cpu/testers/gpu_ruby_test/tester_thread.hh
index 9877d63c24..f31a5a3dea 100644
--- a/src/cpu/testers/gpu_ruby_test/tester_thread.hh
+++ b/src/cpu/testers/gpu_ruby_test/tester_thread.hh
@@ -132,6 +132,7 @@ class TesterThread : public ClockedObject
         {}
     };
 
+    int cacheLineSize;
     // the unique global id of this thread
     int threadId;
     // width of this thread (1 for cpu thread & wf size for gpu wavefront)
@@ -204,6 +205,7 @@ class TesterThread : public ClockedObject
 
     void printOutstandingReqs(const OutstandingReqTable& table,
                               std::stringstream& ss) const;
+    std::string printAddress(Addr addr) const;
 };
 
 } // namespace gem5
diff --git a/src/cpu/testers/rubytest/Check.cc b/src/cpu/testers/rubytest/Check.cc
index 5a83d9ca27..b9c777526a 100644
--- a/src/cpu/testers/rubytest/Check.cc
+++ b/src/cpu/testers/rubytest/Check.cc
@@ -124,7 +124,8 @@ Check::initiatePrefetch()
 
     // push the subblock onto the sender state.  The sequencer will
     // update the subblock on the return
-    pkt->senderState = new SenderState(m_address, req->getSize());
+    pkt->senderState = new SenderState(m_address, req->getSize(),
+                                       CACHE_LINE_BITS);
 
     if (port->sendTimingReq(pkt)) {
         DPRINTF(RubyTest, "successfully initiated prefetch.\n");
@@ -161,7 +162,8 @@ Check::initiateFlush()
 
     // push the subblock onto the sender state.  The sequencer will
     // update the subblock on the return
-    pkt->senderState = new SenderState(m_address, req->getSize());
+    pkt->senderState = new SenderState(m_address, req->getSize(),
+                                       CACHE_LINE_BITS);
 
     if (port->sendTimingReq(pkt)) {
         DPRINTF(RubyTest, "initiating Flush - successful\n");
@@ -207,7 +209,8 @@ Check::initiateAction()
 
     // push the subblock onto the sender state.  The sequencer will
     // update the subblock on the return
-    pkt->senderState = new SenderState(writeAddr, req->getSize());
+    pkt->senderState = new SenderState(m_address, req->getSize(),
+                                       CACHE_LINE_BITS);
 
     if (port->sendTimingReq(pkt)) {
         DPRINTF(RubyTest, "initiating action - successful\n");
@@ -261,7 +264,8 @@ Check::initiateCheck()
 
     // push the subblock onto the sender state.  The sequencer will
     // update the subblock on the return
-    pkt->senderState = new SenderState(m_address, req->getSize());
+    pkt->senderState = new SenderState(m_address, req->getSize(),
+                                       CACHE_LINE_BITS);
 
     if (port->sendTimingReq(pkt)) {
         DPRINTF(RubyTest, "initiating check - successful\n");
@@ -291,7 +295,9 @@ Check::performCallback(ruby::NodeID proc, ruby::SubBlock* data, Cycles curTime)
     // This isn't exactly right since we now have multi-byte checks
     //  assert(getAddress() == address);
 
-    assert(ruby::makeLineAddress(m_address) == ruby::makeLineAddress(address));
+    int block_size_bits = CACHE_LINE_BITS;
+    assert(ruby::makeLineAddress(m_address, block_size_bits) ==
+           ruby::makeLineAddress(address, block_size_bits));
     assert(data != NULL);
 
     DPRINTF(RubyTest, "RubyTester Callback\n");
@@ -342,7 +348,7 @@ Check::performCallback(ruby::NodeID proc, ruby::SubBlock* data, Cycles curTime)
     }
 
     DPRINTF(RubyTest, "proc: %d, Address: 0x%x\n", proc,
-            ruby::makeLineAddress(m_address));
+            ruby::makeLineAddress(m_address, block_size_bits));
     DPRINTF(RubyTest, "Callback done\n");
     debugPrint();
 }
diff --git a/src/cpu/testers/rubytest/Check.hh b/src/cpu/testers/rubytest/Check.hh
index 78e2bda77e..0270b800d7 100644
--- a/src/cpu/testers/rubytest/Check.hh
+++ b/src/cpu/testers/rubytest/Check.hh
@@ -47,6 +47,7 @@ class SubBlock;
 
 const int CHECK_SIZE_BITS = 2;
 const int CHECK_SIZE = (1 << CHECK_SIZE_BITS);
+const int CACHE_LINE_BITS = 6;
 
 class Check
 {
diff --git a/src/cpu/testers/rubytest/RubyTester.hh b/src/cpu/testers/rubytest/RubyTester.hh
index 9397126180..d306c405ef 100644
--- a/src/cpu/testers/rubytest/RubyTester.hh
+++ b/src/cpu/testers/rubytest/RubyTester.hh
@@ -90,7 +90,9 @@ class RubyTester : public ClockedObject
     {
         ruby::SubBlock subBlock;
 
-        SenderState(Addr addr, int size) : subBlock(addr, size) {}
+        SenderState(Addr addr, int size, int cl_bits)
+            : subBlock(addr, size, cl_bits)
+        {} // cl_bits: number of cache line offset bits, not a byte size
 
     };
 
diff --git a/src/mem/ruby/common/Address.cc b/src/mem/ruby/common/Address.cc
index fcf291af51..8b120324c7 100644
--- a/src/mem/ruby/common/Address.cc
+++ b/src/mem/ruby/common/Address.cc
@@ -51,37 +51,33 @@ maskLowOrderBits(Addr addr, unsigned int number)
 }
 
 Addr
-getOffset(Addr addr)
+getOffset(Addr addr, int cacheLineBits)
 {
-    return bitSelect(addr, 0, RubySystem::getBlockSizeBits() - 1);
-}
-
-Addr
-makeLineAddress(Addr addr)
-{
-    return mbits<Addr>(addr, 63, RubySystem::getBlockSizeBits());
+    assert(cacheLineBits < 64);
+    return bitSelect(addr, 0, cacheLineBits - 1);
 }
 
 Addr
 makeLineAddress(Addr addr, int cacheLineBits)
 {
+    assert(cacheLineBits < 64);
     return maskLowOrderBits(addr, cacheLineBits);
 }
 
 // returns the next stride address based on line address
 Addr
-makeNextStrideAddress(Addr addr, int stride)
+makeNextStrideAddress(Addr addr, int stride, int cacheLineBytes)
 {
-    return makeLineAddress(addr) +
-        static_cast<int>(RubySystem::getBlockSizeBytes()) * stride;
+    return makeLineAddress(addr, floorLog2(cacheLineBytes))
+           + cacheLineBytes * stride;
 }
 
 std::string
-printAddress(Addr addr)
+printAddress(Addr addr, int cacheLineBits)
 {
     std::stringstream out;
     out << "[" << std::hex << "0x" << addr << "," << " line 0x"
-       << makeLineAddress(addr) << std::dec << "]";
+       << makeLineAddress(addr, cacheLineBits) << std::dec << "]";
     return out.str();
 }
 
diff --git a/src/mem/ruby/common/Address.hh b/src/mem/ruby/common/Address.hh
index 565c3c1fb7..51e0b5417a 100644
--- a/src/mem/ruby/common/Address.hh
+++ b/src/mem/ruby/common/Address.hh
@@ -33,6 +33,7 @@
 #include <iomanip>
 #include <iostream>
 
+#include "base/intmath.hh"
 #include "base/types.hh"
 
 namespace gem5
@@ -44,11 +45,10 @@ namespace ruby
 // selects bits inclusive
 Addr bitSelect(Addr addr, unsigned int small, unsigned int big);
 Addr maskLowOrderBits(Addr addr, unsigned int number);
-Addr getOffset(Addr addr);
-Addr makeLineAddress(Addr addr);
+Addr getOffset(Addr addr, int cacheLineBits);
 Addr makeLineAddress(Addr addr, int cacheLineBits);
-Addr makeNextStrideAddress(Addr addr, int stride);
-std::string printAddress(Addr addr);
+Addr makeNextStrideAddress(Addr addr, int stride, int cacheLineBytes);
+std::string printAddress(Addr addr, int cacheLineBits);
 
 } // namespace ruby
 } // namespace gem5
diff --git a/src/mem/ruby/common/DataBlock.cc b/src/mem/ruby/common/DataBlock.cc
index 8f47d0026b..bbc0fd21c8 100644
--- a/src/mem/ruby/common/DataBlock.cc
+++ b/src/mem/ruby/common/DataBlock.cc
@@ -40,8 +40,8 @@
 
 #include "mem/ruby/common/DataBlock.hh"
 
+#include "mem/ruby/common/Address.hh"
 #include "mem/ruby/common/WriteMask.hh"
-#include "mem/ruby/system/RubySystem.hh"
 
 namespace gem5
 {
@@ -51,17 +51,22 @@ namespace ruby
 
 DataBlock::DataBlock(const DataBlock &cp)
 {
+    assert(cp.isAlloc());
+    assert(cp.getBlockSize() > 0);
+    assert(!m_alloc);
+
     uint8_t *block_update;
-    size_t block_bytes = RubySystem::getBlockSizeBytes();
-    m_data = new uint8_t[block_bytes];
-    memcpy(m_data, cp.m_data, block_bytes);
+    // Adopt the source block's size before allocating and copying its data.
+    m_block_size = cp.getBlockSize();
+    m_data = new uint8_t[m_block_size];
+    memcpy(m_data, cp.m_data, m_block_size);
     m_alloc = true;
     // If this data block is involved in an atomic operation, the effect
     // of applying the atomic operations on the data block are recorded in
     // m_atomicLog. If so, we must copy over every entry in the change log
     for (size_t i = 0; i < cp.m_atomicLog.size(); i++) {
-        block_update = new uint8_t[block_bytes];
-        memcpy(block_update, cp.m_atomicLog[i], block_bytes);
+        block_update = new uint8_t[m_block_size];
+        memcpy(block_update, cp.m_atomicLog[i], m_block_size);
         m_atomicLog.push_back(block_update);
     }
 }
@@ -69,21 +74,44 @@ DataBlock::DataBlock(const DataBlock &cp)
 void
 DataBlock::alloc()
 {
-    m_data = new uint8_t[RubySystem::getBlockSizeBytes()];
+    assert(!m_alloc);
+
+    if (!m_block_size) {
+        return;
+    }
+
+    m_data = new uint8_t[m_block_size];
     m_alloc = true;
     clear();
 }
 
+void
+DataBlock::realloc(int blk_size)
+{
+    m_block_size = blk_size;
+    assert(m_block_size > 0);
+
+    if (m_alloc) {
+        delete [] m_data;
+        m_alloc = false;
+    }
+    alloc();
+}
+
 void
 DataBlock::clear()
 {
-    memset(m_data, 0, RubySystem::getBlockSizeBytes());
+    assert(m_alloc);
+    assert(m_block_size > 0);
+    memset(m_data, 0, m_block_size);
 }
 
 bool
 DataBlock::equal(const DataBlock& obj) const
 {
-    size_t block_bytes = RubySystem::getBlockSizeBytes();
+    assert(m_alloc);
+    assert(m_block_size > 0);
+    size_t block_bytes = m_block_size;
     // Check that the block contents match
     if (memcmp(m_data, obj.m_data, block_bytes)) {
         return false;
@@ -102,7 +130,9 @@ DataBlock::equal(const DataBlock& obj) const
 void
 DataBlock::copyPartial(const DataBlock &dblk, const WriteMask &mask)
 {
-    for (int i = 0; i < RubySystem::getBlockSizeBytes(); i++) {
+    assert(m_alloc);
+    assert(m_block_size > 0);
+    for (int i = 0; i < m_block_size; i++) {
         if (mask.getMask(i, 1)) {
             m_data[i] = dblk.m_data[i];
         }
@@ -113,7 +143,9 @@ void
 DataBlock::atomicPartial(const DataBlock &dblk, const WriteMask &mask,
         bool isAtomicNoReturn)
 {
-    for (int i = 0; i < RubySystem::getBlockSizeBytes(); i++) {
+    assert(m_alloc);
+    assert(m_block_size > 0);
+    for (int i = 0; i < m_block_size; i++) {
         m_data[i] = dblk.m_data[i];
     }
     mask.performAtomic(m_data, m_atomicLog, isAtomicNoReturn);
@@ -122,7 +154,9 @@ DataBlock::atomicPartial(const DataBlock &dblk, const WriteMask &mask,
 void
 DataBlock::print(std::ostream& out) const
 {
-    int size = RubySystem::getBlockSizeBytes();
+    assert(m_alloc);
+    assert(m_block_size > 0);
+    int size = m_block_size;
     out << "[ ";
     for (int i = 0; i < size; i++) {
         out << std::setw(2) << std::setfill('0') << std::hex
@@ -147,6 +181,7 @@ DataBlock::popAtomicLogEntryFront()
 void
 DataBlock::clearAtomicLogEntries()
 {
+    assert(m_alloc);
     for (auto log : m_atomicLog) {
         delete [] log;
     }
@@ -156,35 +191,59 @@ DataBlock::clearAtomicLogEntries()
 const uint8_t*
 DataBlock::getData(int offset, int len) const
 {
-    assert(offset + len <= RubySystem::getBlockSizeBytes());
+    assert(m_alloc);
+    assert(m_block_size > 0);
+    assert(offset + len <= m_block_size);
     return &m_data[offset];
 }
 
 uint8_t*
 DataBlock::getDataMod(int offset)
 {
+    assert(m_alloc);
     return &m_data[offset];
 }
 
 void
 DataBlock::setData(const uint8_t *data, int offset, int len)
 {
+    assert(m_alloc);
     memcpy(&m_data[offset], data, len);
 }
 
 void
 DataBlock::setData(PacketPtr pkt)
 {
-    int offset = getOffset(pkt->getAddr());
-    assert(offset + pkt->getSize() <= RubySystem::getBlockSizeBytes());
+    assert(m_alloc);
+    assert(m_block_size > 0);
+    int offset = getOffset(pkt->getAddr(), floorLog2(m_block_size));
+    assert(offset + pkt->getSize() <= m_block_size);
     pkt->writeData(&m_data[offset]);
 }
 
 DataBlock &
 DataBlock::operator=(const DataBlock & obj)
 {
+    // Reallocate if needed
+    if (m_alloc && m_block_size != obj.getBlockSize()) {
+        // realloc() frees the old buffer and clears m_alloc before calling
+        // alloc(); calling alloc() directly here would trip its assert.
+        realloc(obj.getBlockSize());
+    } else if (!m_alloc) {
+        m_block_size = obj.getBlockSize();
+        alloc();
+
+        // Assume this will be realloc'd later if zero.
+        if (m_block_size == 0) {
+            return *this;
+        }
+    } else {
+        assert(m_alloc && m_block_size == obj.getBlockSize());
+    }
+    assert(m_block_size > 0);
+
     uint8_t *block_update;
-    size_t block_bytes = RubySystem::getBlockSizeBytes();
+    size_t block_bytes = m_block_size;
     // Copy entire block contents from obj to current block
     memcpy(m_data, obj.m_data, block_bytes);
     // If this data block is involved in an atomic operation, the effect
diff --git a/src/mem/ruby/common/DataBlock.hh b/src/mem/ruby/common/DataBlock.hh
index 7456a25f3f..ebfa7d1383 100644
--- a/src/mem/ruby/common/DataBlock.hh
+++ b/src/mem/ruby/common/DataBlock.hh
@@ -61,8 +61,14 @@ class WriteMask;
 class DataBlock
 {
   public:
-    DataBlock()
+    // Ideally this should not be called. We allow default so that protocols
+    // do not need to be changed.
+    DataBlock() = default;
+
+    DataBlock(int blk_size)
     {
+        assert(!m_alloc);
+        m_block_size = blk_size;
         alloc();
     }
 
@@ -101,10 +107,16 @@ class DataBlock
     bool equal(const DataBlock& obj) const;
     void print(std::ostream& out) const;
 
+    int getBlockSize() const { return m_block_size; }
+    void setBlockSize(int block_size) { realloc(block_size); }
+    bool isAlloc() const { return m_alloc; }
+    void realloc(int blk_size);
+
   private:
     void alloc();
-    uint8_t *m_data;
-    bool m_alloc;
+    uint8_t *m_data = nullptr;
+    bool m_alloc = false;
+    int m_block_size = 0;
 
     // Tracks block changes when atomic ops are applied
     std::deque<uint8_t*> m_atomicLog;
@@ -124,18 +136,21 @@ DataBlock::assign(uint8_t *data)
 inline uint8_t
 DataBlock::getByte(int whichByte) const
 {
+    assert(m_alloc);
     return m_data[whichByte];
 }
 
 inline void
 DataBlock::setByte(int whichByte, uint8_t data)
 {
+    assert(m_alloc);
     m_data[whichByte] = data;
 }
 
 inline void
 DataBlock::copyPartial(const DataBlock & dblk, int offset, int len)
 {
+    assert(m_alloc);
     setData(&dblk.m_data[offset], offset, len);
 }
 
diff --git a/src/mem/ruby/common/NetDest.cc b/src/mem/ruby/common/NetDest.cc
index ba64f2febd..944315b97f 100644
--- a/src/mem/ruby/common/NetDest.cc
+++ b/src/mem/ruby/common/NetDest.cc
@@ -30,6 +30,8 @@
 
 #include <algorithm>
 
+#include "mem/ruby/system/RubySystem.hh"
+
 namespace gem5
 {
 
@@ -38,12 +40,18 @@ namespace ruby
 
 NetDest::NetDest()
 {
-  resize();
+}
+
+NetDest::NetDest(RubySystem *ruby_system)
+    : m_ruby_system(ruby_system)
+{
+    resize();
 }
 
 void
 NetDest::add(MachineID newElement)
 {
+    assert(m_bits.size() > 0);
     assert(bitIndex(newElement.num) < m_bits[vecIndex(newElement)].getSize());
     m_bits[vecIndex(newElement)].add(bitIndex(newElement.num));
 }
@@ -51,6 +59,7 @@ NetDest::add(MachineID newElement)
 void
 NetDest::addNetDest(const NetDest& netDest)
 {
+    assert(m_bits.size() > 0);
     assert(m_bits.size() == netDest.getSize());
     for (int i = 0; i < m_bits.size(); i++) {
         m_bits[i].addSet(netDest.m_bits[i]);
@@ -60,6 +69,8 @@ NetDest::addNetDest(const NetDest& netDest)
 void
 NetDest::setNetDest(MachineType machine, const Set& set)
 {
+    assert(m_ruby_system != nullptr);
+
     // assure that there is only one set of destinations for this machine
     assert(MachineType_base_level((MachineType)(machine + 1)) -
            MachineType_base_level(machine) == 1);
@@ -69,12 +80,14 @@ NetDest::setNetDest(MachineType machine, const Set& set)
 void
 NetDest::remove(MachineID oldElement)
 {
+    assert(m_bits.size() > 0);
     m_bits[vecIndex(oldElement)].remove(bitIndex(oldElement.num));
 }
 
 void
 NetDest::removeNetDest(const NetDest& netDest)
 {
+    assert(m_bits.size() > 0);
     assert(m_bits.size() == netDest.getSize());
     for (int i = 0; i < m_bits.size(); i++) {
         m_bits[i].removeSet(netDest.m_bits[i]);
@@ -84,6 +97,7 @@ NetDest::removeNetDest(const NetDest& netDest)
 void
 NetDest::clear()
 {
+    assert(m_bits.size() > 0);
     for (int i = 0; i < m_bits.size(); i++) {
         m_bits[i].clear();
     }
@@ -101,6 +115,8 @@ NetDest::broadcast()
 void
 NetDest::broadcast(MachineType machineType)
 {
+    assert(m_ruby_system != nullptr);
+
     for (NodeID i = 0; i < MachineType_base_count(machineType); i++) {
         MachineID mach = {machineType, i};
         add(mach);
@@ -111,6 +127,9 @@ NetDest::broadcast(MachineType machineType)
 std::vector<NodeID>
 NetDest::getAllDest()
 {
+    assert(m_ruby_system != nullptr);
+    assert(m_bits.size() > 0);
+
     std::vector<NodeID> dest;
     dest.clear();
     for (int i = 0; i < m_bits.size(); i++) {
@@ -127,6 +146,8 @@ NetDest::getAllDest()
 int
 NetDest::count() const
 {
+    assert(m_bits.size() > 0);
+
     int counter = 0;
     for (int i = 0; i < m_bits.size(); i++) {
         counter += m_bits[i].count();
@@ -137,12 +158,14 @@ NetDest::count() const
 NodeID
 NetDest::elementAt(MachineID index)
 {
+    assert(m_bits.size() > 0);
     return m_bits[vecIndex(index)].elementAt(bitIndex(index.num));
 }
 
 MachineID
 NetDest::smallestElement() const
 {
+    assert(m_bits.size() > 0);
     assert(count() > 0);
     for (int i = 0; i < m_bits.size(); i++) {
         for (NodeID j = 0; j < m_bits[i].getSize(); j++) {
@@ -158,6 +181,9 @@ NetDest::smallestElement() const
 MachineID
 NetDest::smallestElement(MachineType machine) const
 {
+    assert(m_bits.size() > 0);
+    assert(m_ruby_system != nullptr);
+
     int size = m_bits[MachineType_base_level(machine)].getSize();
     for (NodeID j = 0; j < size; j++) {
         if (m_bits[MachineType_base_level(machine)].isElement(j)) {
@@ -173,6 +199,7 @@ NetDest::smallestElement(MachineType machine) const
 bool
 NetDest::isBroadcast() const
 {
+    assert(m_bits.size() > 0);
     for (int i = 0; i < m_bits.size(); i++) {
         if (!m_bits[i].isBroadcast()) {
             return false;
@@ -185,6 +212,7 @@ NetDest::isBroadcast() const
 bool
 NetDest::isEmpty() const
 {
+    assert(m_bits.size() > 0);
     for (int i = 0; i < m_bits.size(); i++) {
         if (!m_bits[i].isEmpty()) {
             return false;
@@ -197,8 +225,9 @@ NetDest::isEmpty() const
 NetDest
 NetDest::OR(const NetDest& orNetDest) const
 {
+    assert(m_bits.size() > 0);
     assert(m_bits.size() == orNetDest.getSize());
-    NetDest result;
+    NetDest result(m_ruby_system);
     for (int i = 0; i < m_bits.size(); i++) {
         result.m_bits[i] = m_bits[i].OR(orNetDest.m_bits[i]);
     }
@@ -209,8 +238,9 @@ NetDest::OR(const NetDest& orNetDest) const
 NetDest
 NetDest::AND(const NetDest& andNetDest) const
 {
+    assert(m_bits.size() > 0);
     assert(m_bits.size() == andNetDest.getSize());
-    NetDest result;
+    NetDest result(m_ruby_system);
     for (int i = 0; i < m_bits.size(); i++) {
         result.m_bits[i] = m_bits[i].AND(andNetDest.m_bits[i]);
     }
@@ -221,6 +251,7 @@ NetDest::AND(const NetDest& andNetDest) const
 bool
 NetDest::intersectionIsNotEmpty(const NetDest& other_netDest) const
 {
+    assert(m_bits.size() > 0);
     assert(m_bits.size() == other_netDest.getSize());
     for (int i = 0; i < m_bits.size(); i++) {
         if (!m_bits[i].intersectionIsEmpty(other_netDest.m_bits[i])) {
@@ -233,6 +264,7 @@ NetDest::intersectionIsNotEmpty(const NetDest& other_netDest) const
 bool
 NetDest::isSuperset(const NetDest& test) const
 {
+    assert(m_bits.size() > 0);
     assert(m_bits.size() == test.getSize());
 
     for (int i = 0; i < m_bits.size(); i++) {
@@ -246,12 +278,15 @@ NetDest::isSuperset(const NetDest& test) const
 bool
 NetDest::isElement(MachineID element) const
 {
+    assert(m_bits.size() > 0);
     return ((m_bits[vecIndex(element)])).isElement(bitIndex(element.num));
 }
 
 void
 NetDest::resize()
 {
+    assert(m_ruby_system != nullptr);
+
     m_bits.resize(MachineType_base_level(MachineType_NUM));
     assert(m_bits.size() == MachineType_NUM);
 
@@ -263,6 +298,7 @@ NetDest::resize()
 void
 NetDest::print(std::ostream& out) const
 {
+    assert(m_bits.size() > 0);
     out << "[NetDest (" << m_bits.size() << ") ";
 
     for (int i = 0; i < m_bits.size(); i++) {
@@ -277,6 +313,7 @@ NetDest::print(std::ostream& out) const
 bool
 NetDest::isEqual(const NetDest& n) const
 {
+    assert(m_bits.size() > 0);
     assert(m_bits.size() == n.m_bits.size());
     for (unsigned int i = 0; i < m_bits.size(); ++i) {
         if (!m_bits[i].isEqual(n.m_bits[i]))
@@ -285,5 +322,19 @@ NetDest::isEqual(const NetDest& n) const
     return true;
 }
 
+int
+NetDest::MachineType_base_count(const MachineType& obj)
+{
+    assert(m_ruby_system != nullptr);
+    return m_ruby_system->MachineType_base_count(obj);
+}
+
+int
+NetDest::MachineType_base_number(const MachineType& obj)
+{
+    assert(m_ruby_system != nullptr);
+    return m_ruby_system->MachineType_base_number(obj);
+}
+
 } // namespace ruby
 } // namespace gem5
diff --git a/src/mem/ruby/common/NetDest.hh b/src/mem/ruby/common/NetDest.hh
index e71b876754..83f340a478 100644
--- a/src/mem/ruby/common/NetDest.hh
+++ b/src/mem/ruby/common/NetDest.hh
@@ -41,6 +41,8 @@ namespace gem5
 namespace ruby
 {
 
+class RubySystem;
+
 // NetDest specifies the network destination of a Message
 class NetDest
 {
@@ -48,6 +50,7 @@ class NetDest
     // Constructors
     // creates and empty set
     NetDest();
+    NetDest(RubySystem *ruby_system);
     explicit NetDest(int bit_size);
 
     NetDest& operator=(const Set& obj);
@@ -98,6 +101,8 @@ class NetDest
 
     void print(std::ostream& out) const;
 
+    void setRubySystem(RubySystem *rs) { m_ruby_system = rs; resize(); }
+
   private:
     // returns a value >= MachineType_base_level("this machine")
     // and < MachineType_base_level("next highest machine")
@@ -112,6 +117,12 @@ class NetDest
     NodeID bitIndex(NodeID index) const { return index; }
 
     std::vector<Set> m_bits;  // a vector of bit vectors - i.e. Sets
+
+    // Needed to call MachineType_base_count/level
+    RubySystem *m_ruby_system = nullptr;
+
+    int MachineType_base_count(const MachineType& obj);
+    int MachineType_base_number(const MachineType& obj);
 };
 
 inline std::ostream&
diff --git a/src/mem/ruby/common/SubBlock.cc b/src/mem/ruby/common/SubBlock.cc
index 92cfd8b633..be0adc1233 100644
--- a/src/mem/ruby/common/SubBlock.cc
+++ b/src/mem/ruby/common/SubBlock.cc
@@ -38,13 +38,14 @@ namespace ruby
 
 using stl_helpers::operator<<;
 
-SubBlock::SubBlock(Addr addr, int size)
+SubBlock::SubBlock(Addr addr, int size, int cl_bits)
 {
     m_address = addr;
     resize(size);
     for (int i = 0; i < size; i++) {
         setByte(i, 0);
     }
+    m_cache_line_bits = cl_bits;
 }
 
 void
@@ -52,7 +53,7 @@ SubBlock::internalMergeFrom(const DataBlock& data)
 {
     int size = getSize();
     assert(size > 0);
-    int offset = getOffset(m_address);
+    int offset = getOffset(m_address, m_cache_line_bits);
     for (int i = 0; i < size; i++) {
         this->setByte(i, data.getByte(offset + i));
     }
@@ -63,7 +64,7 @@ SubBlock::internalMergeTo(DataBlock& data) const
 {
     int size = getSize();
     assert(size > 0);
-    int offset = getOffset(m_address);
+    int offset = getOffset(m_address, m_cache_line_bits);
     for (int i = 0; i < size; i++) {
         // This will detect crossing a cache line boundary
         data.setByte(offset + i, this->getByte(i));
diff --git a/src/mem/ruby/common/SubBlock.hh b/src/mem/ruby/common/SubBlock.hh
index e1a83600c2..3790bbac58 100644
--- a/src/mem/ruby/common/SubBlock.hh
+++ b/src/mem/ruby/common/SubBlock.hh
@@ -45,7 +45,7 @@ class SubBlock
 {
   public:
     SubBlock() { }
-    SubBlock(Addr addr, int size);
+    SubBlock(Addr addr, int size, int cl_bits);
     ~SubBlock() { }
 
     Addr getAddress() const { return m_address; }
@@ -74,6 +74,7 @@ class SubBlock
     // Data Members (m_ prefix)
     Addr m_address;
     std::vector<uint8_t> m_data;
+    int m_cache_line_bits = 0; // stays 0 when the default ctor is used
 };
 
 inline std::ostream&
diff --git a/src/mem/ruby/common/WriteMask.cc b/src/mem/ruby/common/WriteMask.cc
index 1fa03c951e..f176aec9fc 100644
--- a/src/mem/ruby/common/WriteMask.cc
+++ b/src/mem/ruby/common/WriteMask.cc
@@ -39,13 +39,13 @@ namespace ruby
 {
 
 WriteMask::WriteMask()
-    : mSize(RubySystem::getBlockSizeBytes()), mMask(mSize, false),
-      mAtomic(false)
+    : mSize(0), mMask(mSize, false), mAtomic(false)
 {}
 
 void
 WriteMask::print(std::ostream& out) const
 {
+    assert(mSize > 0);
     std::string str(mSize,'0');
     for (int i = 0; i < mSize; i++) {
         str[i] = mMask[i] ? ('1') : ('0');
@@ -59,6 +59,7 @@ void
 WriteMask::performAtomic(uint8_t * p,
         std::deque<uint8_t*>& log, bool isAtomicNoReturn) const
 {
+    assert(mSize > 0);
     int offset;
     uint8_t *block_update;
     // Here, operations occur in FIFO order from the mAtomicOp
diff --git a/src/mem/ruby/common/WriteMask.hh b/src/mem/ruby/common/WriteMask.hh
index 8c6b8ce976..e620997cd8 100644
--- a/src/mem/ruby/common/WriteMask.hh
+++ b/src/mem/ruby/common/WriteMask.hh
@@ -78,6 +78,17 @@ class WriteMask
     ~WriteMask()
     {}
 
+    int getBlockSize() const { return mSize; }
+    void
+    setBlockSize(int size)
+    {
+        // Use once after the default ctor (probably by src/mem/ruby/protocol/
+        // RubySlicc_MemControl.sm); mMask was built empty, so size it too.
+        assert(mSize == 0 && size > 0);
+        mSize = size;
+        mMask.assign(mSize, false);
+    }
+
     void
     clear()
     {
@@ -87,6 +98,7 @@ class WriteMask
     bool
     test(int offset) const
     {
+        assert(mSize > 0);
         assert(offset < mSize);
         return mMask[offset];
     }
@@ -94,6 +106,7 @@ class WriteMask
     void
     setMask(int offset, int len, bool val = true)
     {
+        assert(mSize > 0);
         assert(mSize >= (offset + len));
         for (int i = 0; i < len; i++) {
             mMask[offset + i] = val;
@@ -102,6 +115,7 @@ class WriteMask
     void
     fillMask()
     {
+        assert(mSize > 0);
         for (int i = 0; i < mSize; i++) {
             mMask[i] = true;
         }
@@ -111,6 +125,7 @@ class WriteMask
     getMask(int offset, int len) const
     {
         bool tmp = true;
+        assert(mSize > 0);
         assert(mSize >= (offset + len));
         for (int i = 0; i < len; i++) {
             tmp = tmp & mMask.at(offset + i);
@@ -122,6 +137,7 @@ class WriteMask
     isOverlap(const WriteMask &readMask) const
     {
         bool tmp = false;
+        assert(mSize > 0);
         assert(mSize == readMask.mSize);
         for (int i = 0; i < mSize; i++) {
             if (readMask.mMask.at(i)) {
@@ -135,6 +151,7 @@ class WriteMask
     containsMask(const WriteMask &readMask) const
     {
         bool tmp = true;
+        assert(mSize > 0);
         assert(mSize == readMask.mSize);
         for (int i = 0; i < mSize; i++) {
             if (readMask.mMask.at(i)) {
@@ -146,6 +163,7 @@ class WriteMask
 
     bool isEmpty() const
     {
+        assert(mSize > 0);
         for (int i = 0; i < mSize; i++) {
             if (mMask.at(i)) {
                 return false;
@@ -157,6 +175,7 @@ class WriteMask
     bool
     isFull() const
     {
+        assert(mSize > 0);
         for (int i = 0; i < mSize; i++) {
             if (!mMask.at(i)) {
                 return false;
@@ -168,6 +187,7 @@ class WriteMask
     void
     andMask(const WriteMask & writeMask)
     {
+        assert(mSize > 0);
         assert(mSize == writeMask.mSize);
         for (int i = 0; i < mSize; i++) {
             mMask[i] = (mMask.at(i)) && (writeMask.mMask.at(i));
@@ -182,6 +202,7 @@ class WriteMask
     void
     orMask(const WriteMask & writeMask)
     {
+        assert(mSize > 0);
         assert(mSize == writeMask.mSize);
         for (int i = 0; i < mSize; i++) {
             mMask[i] = (mMask.at(i)) || (writeMask.mMask.at(i));
@@ -196,6 +217,7 @@ class WriteMask
     void
     setInvertedMask(const WriteMask & writeMask)
     {
+        assert(mSize > 0);
         assert(mSize == writeMask.mSize);
         for (int i = 0; i < mSize; i++) {
             mMask[i] = !writeMask.mMask.at(i);
@@ -205,6 +227,7 @@ class WriteMask
     int
     firstBitSet(bool val, int offset = 0) const
     {
+        assert(mSize > 0);
         for (int i = offset; i < mSize; ++i)
             if (mMask[i] == val)
                 return i;
@@ -214,6 +237,7 @@ class WriteMask
     int
     count(int offset = 0) const
     {
+        assert(mSize > 0);
         int count = 0;
         for (int i = offset; i < mSize; ++i)
             count += mMask[i];
diff --git a/src/mem/ruby/network/MessageBuffer.cc b/src/mem/ruby/network/MessageBuffer.cc
index 9a4439a538..8b3a724469 100644
--- a/src/mem/ruby/network/MessageBuffer.cc
+++ b/src/mem/ruby/network/MessageBuffer.cc
@@ -47,7 +47,6 @@
 #include "base/random.hh"
 #include "base/stl_helpers.hh"
 #include "debug/RubyQueue.hh"
-#include "mem/ruby/system/RubySystem.hh"
 
 namespace gem5
 {
@@ -216,6 +215,7 @@ random_time()
 
 void
 MessageBuffer::enqueue(MsgPtr message, Tick current_time, Tick delta,
+                       bool ruby_is_random, bool ruby_warmup,
                        bool bypassStrictFIFO)
 {
     // record current time incase we have a pop that also adjusts my size
@@ -237,7 +237,7 @@ MessageBuffer::enqueue(MsgPtr message, Tick current_time, Tick delta,
     // is turned on and this buffer allows it
     if ((m_randomization == MessageRandomization::disabled) ||
         ((m_randomization == MessageRandomization::ruby_system) &&
-          !RubySystem::getRandomization())) {
+          !ruby_is_random)) {
         // No randomization
         arrival_time = current_time + delta;
     } else {
@@ -265,7 +265,7 @@ MessageBuffer::enqueue(MsgPtr message, Tick current_time, Tick delta,
     }
 
     // If running a cache trace, don't worry about the last arrival checks
-    if (!RubySystem::getWarmupEnabled()) {
+    if (!ruby_warmup) {
         m_last_arrival_time = arrival_time;
     }
 
@@ -447,7 +447,6 @@ MessageBuffer::stallMessage(Addr addr, Tick current_time)
 {
     DPRINTF(RubyQueue, "Stalling due to %#x\n", addr);
     assert(isReady(current_time));
-    assert(getOffset(addr) == 0);
     MsgPtr message = m_prio_heap.front();
 
     // Since the message will just be moved to stall map, indicate that the
@@ -479,7 +478,8 @@ MessageBuffer::deferEnqueueingMessage(Addr addr, MsgPtr message)
 }
 
 void
-MessageBuffer::enqueueDeferredMessages(Addr addr, Tick curTime, Tick delay)
+MessageBuffer::enqueueDeferredMessages(Addr addr, Tick curTime, Tick delay,
+                                       bool ruby_is_random, bool ruby_warmup)
 {
     assert(!isDeferredMsgMapEmpty(addr));
     std::vector<MsgPtr>& msg_vec = m_deferred_msg_map[addr];
@@ -487,7 +487,7 @@ MessageBuffer::enqueueDeferredMessages(Addr addr, Tick curTime, Tick delay)
 
     // enqueue all deferred messages associated with this address
     for (MsgPtr m : msg_vec) {
-        enqueue(m, curTime, delay);
+        enqueue(m, curTime, delay, ruby_is_random, ruby_warmup);
     }
 
     msg_vec.clear();
diff --git a/src/mem/ruby/network/MessageBuffer.hh b/src/mem/ruby/network/MessageBuffer.hh
index 03a0454433..b45e531d11 100644
--- a/src/mem/ruby/network/MessageBuffer.hh
+++ b/src/mem/ruby/network/MessageBuffer.hh
@@ -90,13 +90,14 @@ class MessageBuffer : public SimObject
     Tick readyTime() const;
 
     void
-    delayHead(Tick current_time, Tick delta)
+    delayHead(Tick current_time, Tick delta, bool ruby_is_random,
+              bool ruby_warmup)
     {
         MsgPtr m = m_prio_heap.front();
         std::pop_heap(m_prio_heap.begin(), m_prio_heap.end(),
                       std::greater<MsgPtr>());
         m_prio_heap.pop_back();
-        enqueue(m, current_time, delta);
+        enqueue(m, current_time, delta, ruby_is_random, ruby_warmup);
     }
 
     bool areNSlotsAvailable(unsigned int n, Tick curTime);
@@ -124,6 +125,7 @@ class MessageBuffer : public SimObject
     const MsgPtr &peekMsgPtr() const { return m_prio_heap.front(); }
 
     void enqueue(MsgPtr message, Tick curTime, Tick delta,
+                bool ruby_is_random, bool ruby_warmup,
                 bool bypassStrictFIFO = false);
 
     // Defer enqueueing a message to a later cycle by putting it aside and not
@@ -135,7 +137,8 @@ class MessageBuffer : public SimObject
 
     // enqueue all previously deferred messages that are associated with the
     // input address
-    void enqueueDeferredMessages(Addr addr, Tick curTime, Tick delay);
+    void enqueueDeferredMessages(Addr addr, Tick curTime, Tick delay,
+                                 bool ruby_is_random, bool ruby_warmup);
     bool isDeferredMsgMapEmpty(Addr addr) const;
 
     //! Updates the delay cycles of the message at the head of the queue,
diff --git a/src/mem/ruby/network/Network.cc b/src/mem/ruby/network/Network.cc
index 757ed9498e..480b5bcef0 100644
--- a/src/mem/ruby/network/Network.cc
+++ b/src/mem/ruby/network/Network.cc
@@ -65,7 +65,8 @@ Network::Network(const Params &p)
              "%s: data message size > cache line size", name());
     m_data_msg_size = p.data_msg_size + m_control_msg_size;
 
-    params().ruby_system->registerNetwork(this);
+    m_ruby_system = p.ruby_system;
+    m_ruby_system->registerNetwork(this);
 
     // Populate localNodeVersions with the version of each MachineType in
     // this network. This will be used to compute a global to local ID.
@@ -102,7 +103,8 @@ Network::Network(const Params &p)
 
     m_topology_ptr = new Topology(m_nodes, p.routers.size(),
                                   m_virtual_networks,
-                                  p.ext_links, p.int_links);
+                                  p.ext_links, p.int_links,
+                                  m_ruby_system);
 
     // Allocate to and from queues
     // Queues that are getting messages from protocol
@@ -246,7 +248,7 @@ Network::addressToNodeID(Addr addr, MachineType mtype)
             }
         }
     }
-    return MachineType_base_count(mtype);
+    return m_ruby_system->MachineType_base_count(mtype);
 }
 
 NodeID
@@ -256,5 +258,23 @@ Network::getLocalNodeID(NodeID global_id) const
     return globalToLocalMap.at(global_id);
 }
 
+bool
+Network::getRandomization() const
+{
+    return m_ruby_system->getRandomization();
+}
+
+bool
+Network::getWarmupEnabled() const
+{
+    return m_ruby_system->getWarmupEnabled();
+}
+
+int
+Network::MachineType_base_number(const MachineType& obj)
+{
+    return m_ruby_system->MachineType_base_number(obj);
+}
+
 } // namespace ruby
 } // namespace gem5
diff --git a/src/mem/ruby/network/Network.hh b/src/mem/ruby/network/Network.hh
index 8ca68a0279..c0d21af240 100644
--- a/src/mem/ruby/network/Network.hh
+++ b/src/mem/ruby/network/Network.hh
@@ -78,6 +78,7 @@ namespace ruby
 
 class NetDest;
 class MessageBuffer;
+class RubySystem;
 
 class Network : public ClockedObject
 {
@@ -147,6 +148,10 @@ class Network : public ClockedObject
 
     NodeID getLocalNodeID(NodeID global_id) const;
 
+    bool getRandomization() const;
+    bool getWarmupEnabled() const;
+    RubySystem *getRubySystem() const { return m_ruby_system; }
+
   protected:
     // Private copy constructor and assignment operator
     Network(const Network& obj);
@@ -176,6 +181,12 @@ class Network : public ClockedObject
     // Global NodeID to local node map. If there are not multiple networks in
     // the same RubySystem, this is a one-to-one mapping of global to local.
     std::unordered_map<NodeID, NodeID> globalToLocalMap;
+
+    // For accessing if randomization/warmup are turned on. We cannot store
+    // those values in the constructor in case we are constructed first.
+    RubySystem *m_ruby_system = nullptr;
+
+    int MachineType_base_number(const MachineType& obj);
 };
 
 inline std::ostream&
diff --git a/src/mem/ruby/network/Topology.cc b/src/mem/ruby/network/Topology.cc
index 39444c9023..b2cd7897f8 100644
--- a/src/mem/ruby/network/Topology.cc
+++ b/src/mem/ruby/network/Topology.cc
@@ -37,6 +37,7 @@
 #include "mem/ruby/network/BasicLink.hh"
 #include "mem/ruby/network/Network.hh"
 #include "mem/ruby/slicc_interface/AbstractController.hh"
+#include "mem/ruby/system/RubySystem.hh"
 
 namespace gem5
 {
@@ -56,10 +57,12 @@ const int INFINITE_LATENCY = 10000; // Yes, this is a big hack
 Topology::Topology(uint32_t num_nodes, uint32_t num_routers,
                    uint32_t num_vnets,
                    const std::vector<BasicExtLink *> &ext_links,
-                   const std::vector<BasicIntLink *> &int_links)
-    : m_nodes(MachineType_base_number(MachineType_NUM)),
+                   const std::vector<BasicIntLink *> &int_links,
+                   RubySystem *ruby_system)
+    : m_nodes(ruby_system->MachineType_base_number(MachineType_NUM)),
       m_number_of_switches(num_routers), m_vnets(num_vnets),
-      m_ext_link_vector(ext_links), m_int_link_vector(int_links)
+      m_ext_link_vector(ext_links), m_int_link_vector(int_links),
+      m_ruby_system(ruby_system)
 {
     // Total nodes/controllers in network
     assert(m_nodes > 1);
@@ -78,7 +81,8 @@ Topology::Topology(uint32_t num_nodes, uint32_t num_routers,
         AbstractController *abs_cntrl = ext_link->params().ext_node;
         BasicRouter *router = ext_link->params().int_node;
 
-        int machine_base_idx = MachineType_base_number(abs_cntrl->getType());
+        int machine_base_idx =
+            ruby_system->MachineType_base_number(abs_cntrl->getType());
         int ext_idx1 = machine_base_idx + abs_cntrl->getVersion();
         int ext_idx2 = ext_idx1 + m_nodes;
         int int_idx = router->params().router_id + 2*m_nodes;
@@ -189,7 +193,7 @@ Topology::createLinks(Network *net)
     for (int i = 0; i < topology_weights[0].size(); i++) {
         for (int j = 0; j < topology_weights[0][i].size(); j++) {
             std::vector<NetDest> routingMap;
-            routingMap.resize(m_vnets);
+            routingMap.resize(m_vnets, m_ruby_system);
 
             // Not all sources and destinations are connected
             // by direct links. We only construct the links
@@ -264,7 +268,7 @@ Topology::makeLink(Network *net, SwitchID src, SwitchID dest,
         for (int l = 0; l < links.size(); l++) {
             link_entry = links[l];
             std::vector<NetDest> linkRoute;
-            linkRoute.resize(m_vnets);
+            linkRoute.resize(m_vnets, m_ruby_system);
             BasicLink *link = link_entry.link;
             if (link->mVnets.size() == 0) {
                 net->makeExtInLink(src, dest - (2 * m_nodes), link,
@@ -287,7 +291,7 @@ Topology::makeLink(Network *net, SwitchID src, SwitchID dest,
         for (int l = 0; l < links.size(); l++) {
             link_entry = links[l];
             std::vector<NetDest> linkRoute;
-            linkRoute.resize(m_vnets);
+            linkRoute.resize(m_vnets, m_ruby_system);
             BasicLink *link = link_entry.link;
             if (link->mVnets.size() == 0) {
                 net->makeExtOutLink(src - (2 * m_nodes), node, link,
@@ -309,7 +313,7 @@ Topology::makeLink(Network *net, SwitchID src, SwitchID dest,
         for (int l = 0; l < links.size(); l++) {
             link_entry = links[l];
             std::vector<NetDest> linkRoute;
-            linkRoute.resize(m_vnets);
+            linkRoute.resize(m_vnets, m_ruby_system);
             BasicLink *link = link_entry.link;
             if (link->mVnets.size() == 0) {
                 net->makeInternalLink(src - (2 * m_nodes),
@@ -413,16 +417,17 @@ Topology::shortest_path_to_node(SwitchID src, SwitchID next,
                                 const Matrix &weights, const Matrix &dist,
                                 int vnet)
 {
-    NetDest result;
+    NetDest result(m_ruby_system);
     int d = 0;
     int machines;
     int max_machines;
 
     machines = MachineType_NUM;
-    max_machines = MachineType_base_number(MachineType_NUM);
+    max_machines = m_ruby_system->MachineType_base_number(MachineType_NUM);
 
     for (int m = 0; m < machines; m++) {
-        for (NodeID i = 0; i < MachineType_base_count((MachineType)m); i++) {
+        for (NodeID i = 0;
+            i < m_ruby_system->MachineType_base_count((MachineType)m); i++) {
             // we use "d+max_machines" below since the "destination"
             // switches for the machines are numbered
             // [MachineType_base_number(MachineType_NUM)...
diff --git a/src/mem/ruby/network/Topology.hh b/src/mem/ruby/network/Topology.hh
index 301811e6ab..7ab395762a 100644
--- a/src/mem/ruby/network/Topology.hh
+++ b/src/mem/ruby/network/Topology.hh
@@ -80,7 +80,8 @@ class Topology
   public:
     Topology(uint32_t num_nodes, uint32_t num_routers, uint32_t num_vnets,
              const std::vector<BasicExtLink *> &ext_links,
-             const std::vector<BasicIntLink *> &int_links);
+             const std::vector<BasicIntLink *> &int_links,
+             RubySystem *ruby_system);
 
     uint32_t numSwitches() const { return m_number_of_switches; }
     void createLinks(Network *net);
@@ -108,7 +109,7 @@ class Topology
                                   const Matrix &weights, const Matrix &dist,
                                   int vnet);
 
-    const uint32_t m_nodes;
+    uint32_t m_nodes;
     const uint32_t m_number_of_switches;
     int m_vnets;
 
@@ -116,6 +117,8 @@ class Topology
     std::vector<BasicIntLink*> m_int_link_vector;
 
     LinkMap m_link_map;
+
+    RubySystem *m_ruby_system = nullptr;
 };
 
 inline std::ostream&
diff --git a/src/mem/ruby/network/garnet/NetworkInterface.cc b/src/mem/ruby/network/garnet/NetworkInterface.cc
index 31d625c4d5..8564baca6d 100644
--- a/src/mem/ruby/network/garnet/NetworkInterface.cc
+++ b/src/mem/ruby/network/garnet/NetworkInterface.cc
@@ -41,6 +41,7 @@
 #include "mem/ruby/network/garnet/Credit.hh"
 #include "mem/ruby/network/garnet/flitBuffer.hh"
 #include "mem/ruby/slicc_interface/Message.hh"
+#include "mem/ruby/system/RubySystem.hh"
 
 namespace gem5
 {
@@ -244,7 +245,9 @@ NetworkInterface::wakeup()
                     outNode_ptr[vnet]->areNSlotsAvailable(1, curTime)) {
                     // Space is available. Enqueue to protocol buffer.
                     outNode_ptr[vnet]->enqueue(t_flit->get_msg_ptr(), curTime,
-                                               cyclesToTicks(Cycles(1)));
+                                               cyclesToTicks(Cycles(1)),
+                                               m_net_ptr->getRandomization(),
+                                               m_net_ptr->getWarmupEnabled());
 
                     // Simply send a credit back since we are not buffering
                     // this flit in the NI
@@ -332,7 +335,9 @@ NetworkInterface::checkStallQueue()
                 if (outNode_ptr[vnet]->areNSlotsAvailable(1,
                     curTime)) {
                     outNode_ptr[vnet]->enqueue(stallFlit->get_msg_ptr(),
-                        curTime, cyclesToTicks(Cycles(1)));
+                        curTime, cyclesToTicks(Cycles(1)),
+                        m_net_ptr->getRandomization(),
+                        m_net_ptr->getWarmupEnabled());
 
                     // Send back a credit with free signal now that the
                     // VC is no longer stalled.
@@ -699,6 +704,12 @@ NetworkInterface::functionalWrite(Packet *pkt)
     return num_functional_writes;
 }
 
+int
+NetworkInterface::MachineType_base_number(const MachineType& obj)
+{
+    return m_net_ptr->getRubySystem()->MachineType_base_number(obj);
+}
+
 } // namespace garnet
 } // namespace ruby
 } // namespace gem5
diff --git a/src/mem/ruby/network/garnet/NetworkInterface.hh b/src/mem/ruby/network/garnet/NetworkInterface.hh
index d42db5ee2a..cd7bb3b171 100644
--- a/src/mem/ruby/network/garnet/NetworkInterface.hh
+++ b/src/mem/ruby/network/garnet/NetworkInterface.hh
@@ -306,6 +306,8 @@ class NetworkInterface : public ClockedObject, public Consumer
 
     InputPort *getInportForVnet(int vnet);
     OutputPort *getOutportForVnet(int vnet);
+
+    int MachineType_base_number(const MachineType& obj);
 };
 
 } // namespace garnet
diff --git a/src/mem/ruby/network/simple/PerfectSwitch.cc b/src/mem/ruby/network/simple/PerfectSwitch.cc
index 74d78e3aae..20d57f04be 100644
--- a/src/mem/ruby/network/simple/PerfectSwitch.cc
+++ b/src/mem/ruby/network/simple/PerfectSwitch.cc
@@ -268,7 +268,8 @@ PerfectSwitch::operateMessageBuffer(MessageBuffer *buffer, int vnet)
                     buffer->getIncomingLink(), vnet, outgoing, vnet);
 
             out_port.buffers[vnet]->enqueue(msg_ptr, current_time,
-                                           out_port.latency);
+                out_port.latency, m_switch->getNetPtr()->getRandomization(),
+                m_switch->getNetPtr()->getWarmupEnabled());
         }
     }
 }
diff --git a/src/mem/ruby/network/simple/Switch.hh b/src/mem/ruby/network/simple/Switch.hh
index 86abfda871..e6e22022bc 100644
--- a/src/mem/ruby/network/simple/Switch.hh
+++ b/src/mem/ruby/network/simple/Switch.hh
@@ -104,6 +104,7 @@ class Switch : public BasicRouter
 
     void print(std::ostream& out) const;
     void init_net_ptr(SimpleNetwork* net_ptr) { m_network_ptr = net_ptr; }
+    SimpleNetwork* getNetPtr() const { return m_network_ptr; }
 
     bool functionalRead(Packet *);
     bool functionalRead(Packet *, WriteMask&);
diff --git a/src/mem/ruby/network/simple/Throttle.cc b/src/mem/ruby/network/simple/Throttle.cc
index 20cebccabb..fc5649330f 100644
--- a/src/mem/ruby/network/simple/Throttle.cc
+++ b/src/mem/ruby/network/simple/Throttle.cc
@@ -199,7 +199,9 @@ Throttle::operateVnet(int vnet, int channel, int &total_bw_remaining,
             // Move the message
             in->dequeue(current_time);
             out->enqueue(msg_ptr, current_time,
-                         m_switch->cyclesToTicks(m_link_latency));
+                         m_switch->cyclesToTicks(m_link_latency),
+                         m_ruby_system->getRandomization(),
+                         m_ruby_system->getWarmupEnabled());
 
             // Count the message
             (*(throttleStats.
diff --git a/src/mem/ruby/profiler/AddressProfiler.cc b/src/mem/ruby/profiler/AddressProfiler.cc
index 05fc486c63..ce40c35a9f 100644
--- a/src/mem/ruby/profiler/AddressProfiler.cc
+++ b/src/mem/ruby/profiler/AddressProfiler.cc
@@ -34,6 +34,7 @@
 #include "base/stl_helpers.hh"
 #include "mem/ruby/profiler/Profiler.hh"
 #include "mem/ruby/protocol/RubyRequest.hh"
+#include "mem/ruby/system/RubySystem.hh"
 
 namespace gem5
 {
@@ -307,7 +308,8 @@ AddressProfiler::addTraceSample(Addr data_addr, Addr pc_addr,
         }
 
         // record data address trace info
-        data_addr = makeLineAddress(data_addr);
+        int block_size_bits = m_profiler->m_ruby_system->getBlockSizeBits();
+        data_addr = makeLineAddress(data_addr, block_size_bits);
         lookupTraceForAddress(data_addr, m_dataAccessTrace).
             update(type, access_mode, id, sharing_miss);
 
diff --git a/src/mem/ruby/protocol/GPU_VIPER-SQC.sm b/src/mem/ruby/protocol/GPU_VIPER-SQC.sm
index ca606a5921..43fb96c375 100644
--- a/src/mem/ruby/protocol/GPU_VIPER-SQC.sm
+++ b/src/mem/ruby/protocol/GPU_VIPER-SQC.sm
@@ -95,7 +95,7 @@ machine(MachineType:SQC, "GPU SQC (L1 I Cache)")
   }
 
   TBETable TBEs, template="<SQC_TBE>", constructor="m_number_of_TBEs";
-  int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()";
+  int TCC_select_low_bit, default="m_ruby_system->getBlockSizeBits()";
 
   void set_cache_entry(AbstractCacheEntry b);
   void unset_cache_entry();
diff --git a/src/mem/ruby/protocol/GPU_VIPER-TCP.sm b/src/mem/ruby/protocol/GPU_VIPER-TCP.sm
index 5d98a73041..d1e1ffb7b0 100644
--- a/src/mem/ruby/protocol/GPU_VIPER-TCP.sm
+++ b/src/mem/ruby/protocol/GPU_VIPER-TCP.sm
@@ -121,7 +121,7 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
   }
 
   TBETable TBEs, template="<TCP_TBE>", constructor="m_number_of_TBEs";
-  int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()";
+  int TCC_select_low_bit, default="m_ruby_system->getBlockSizeBits()";
   int WTcnt, default="0";
   int Fcnt, default="0";
   bool inFlush, default="false";
diff --git a/src/mem/ruby/protocol/MESI_Three_Level-L1cache.sm b/src/mem/ruby/protocol/MESI_Three_Level-L1cache.sm
index bcf99ff362..ed5e40cfa1 100644
--- a/src/mem/ruby/protocol/MESI_Three_Level-L1cache.sm
+++ b/src/mem/ruby/protocol/MESI_Three_Level-L1cache.sm
@@ -167,7 +167,7 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP")
 
   TBETable TBEs, template="<L1Cache_TBE>", constructor="m_number_of_TBEs";
 
-  int l2_select_low_bit, default="RubySystem::getBlockSizeBits()";
+  int l2_select_low_bit, default="m_ruby_system->getBlockSizeBits()";
 
   Tick clockEdge();
   Cycles ticksToCycles(Tick t);
diff --git a/src/mem/ruby/protocol/MESI_Two_Level-L1cache.sm b/src/mem/ruby/protocol/MESI_Two_Level-L1cache.sm
index 2b5935dee5..29f6d8e87d 100644
--- a/src/mem/ruby/protocol/MESI_Two_Level-L1cache.sm
+++ b/src/mem/ruby/protocol/MESI_Two_Level-L1cache.sm
@@ -167,7 +167,7 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP")
 
   TBETable TBEs, template="<L1Cache_TBE>", constructor="m_number_of_TBEs";
 
-  int l2_select_low_bit, default="RubySystem::getBlockSizeBits()";
+  int l2_select_low_bit, default="m_ruby_system->getBlockSizeBits()";
 
   Tick clockEdge();
   Cycles ticksToCycles(Tick t);
diff --git a/src/mem/ruby/protocol/MOESI_AMD_Base-RegionBuffer.sm b/src/mem/ruby/protocol/MOESI_AMD_Base-RegionBuffer.sm
index 5d85ad2fc6..bac7fd1b12 100644
--- a/src/mem/ruby/protocol/MOESI_AMD_Base-RegionBuffer.sm
+++ b/src/mem/ruby/protocol/MOESI_AMD_Base-RegionBuffer.sm
@@ -181,7 +181,7 @@ machine(MachineType:RegionBuffer, "Region Buffer for AMD_Base-like protocol")
 
   // Stores only region addresses
   TBETable TBEs, template="<RegionBuffer_TBE>", constructor="m_number_of_TBEs";
-  int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()";
+  int TCC_select_low_bit, default="m_ruby_system->getBlockSizeBits()";
 
   Tick clockEdge();
   Tick cyclesToTicks(Cycles c);
@@ -195,8 +195,8 @@ machine(MachineType:RegionBuffer, "Region Buffer for AMD_Base-like protocol")
   Cycles curCycle();
   MachineID mapAddressToMachine(Addr addr, MachineType mtype);
 
-  int blockBits,  default="RubySystem::getBlockSizeBits()";
-  int blockBytes, default="RubySystem::getBlockSizeBytes()";
+  int blockBits,  default="m_ruby_system->getBlockSizeBits()";
+  int blockBytes, default="m_ruby_system->getBlockSizeBytes()";
   int regionBits, default="log2(m_blocksPerRegion)";
 
   // Functions
diff --git a/src/mem/ruby/protocol/MOESI_AMD_Base-RegionDir.sm b/src/mem/ruby/protocol/MOESI_AMD_Base-RegionDir.sm
index 2464e038ff..3f1ba2540f 100644
--- a/src/mem/ruby/protocol/MOESI_AMD_Base-RegionDir.sm
+++ b/src/mem/ruby/protocol/MOESI_AMD_Base-RegionDir.sm
@@ -155,7 +155,7 @@ machine(MachineType:RegionDir, "Region Directory for AMD_Base-like protocol")
 
   // Stores only region addresses
   TBETable TBEs, template="<RegionDir_TBE>", constructor="m_number_of_TBEs";
-  int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()";
+  int TCC_select_low_bit, default="m_ruby_system->getBlockSizeBits()";
 
   Tick clockEdge();
   Tick cyclesToTicks(Cycles c);
@@ -169,8 +169,8 @@ machine(MachineType:RegionDir, "Region Directory for AMD_Base-like protocol")
   Cycles curCycle();
   MachineID mapAddressToMachine(Addr addr, MachineType mtype);
 
-  int blockBits,  default="RubySystem::getBlockSizeBits()";
-  int blockBytes, default="RubySystem::getBlockSizeBytes()";
+  int blockBits,  default="m_ruby_system->getBlockSizeBits()";
+  int blockBytes, default="m_ruby_system->getBlockSizeBytes()";
   int regionBits, default="log2(m_blocksPerRegion)";
 
   // Functions
diff --git a/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm b/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm
index 17a92f5f90..5b5ab3148a 100644
--- a/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm
+++ b/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm
@@ -183,7 +183,7 @@ machine(MachineType:Directory, "AMD Baseline protocol")
 
   TBETable TBEs, template="<Directory_TBE>", constructor="m_number_of_TBEs";
 
-  int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()";
+  int TCC_select_low_bit, default="m_ruby_system->getBlockSizeBits()";
 
   Tick clockEdge();
   Tick cyclesToTicks(Cycles c);
diff --git a/src/mem/ruby/protocol/MOESI_AMD_Base-probeFilter.sm b/src/mem/ruby/protocol/MOESI_AMD_Base-probeFilter.sm
index 4e9e9597aa..b53ebe8ee2 100644
--- a/src/mem/ruby/protocol/MOESI_AMD_Base-probeFilter.sm
+++ b/src/mem/ruby/protocol/MOESI_AMD_Base-probeFilter.sm
@@ -192,7 +192,7 @@ machine(MachineType:Directory, "AMD Baseline protocol")
 
   TBETable TBEs, template="<Directory_TBE>", constructor="m_number_of_TBEs";
 
-  int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()";
+  int TCC_select_low_bit, default="m_ruby_system->getBlockSizeBits()";
 
   Tick clockEdge();
   Tick cyclesToTicks(Cycles c);
diff --git a/src/mem/ruby/protocol/MOESI_CMP_directory-dir.sm b/src/mem/ruby/protocol/MOESI_CMP_directory-dir.sm
index 4a513d6d3f..b6410d12e7 100644
--- a/src/mem/ruby/protocol/MOESI_CMP_directory-dir.sm
+++ b/src/mem/ruby/protocol/MOESI_CMP_directory-dir.sm
@@ -143,7 +143,7 @@ machine(MachineType:Directory, "Directory protocol")
     bool isPresent(Addr);
   }
 
-  int blockSize, default="RubySystem::getBlockSizeBytes()";
+  int blockSize, default="m_ruby_system->getBlockSizeBytes()";
 
   // ** OBJECTS **
   TBETable TBEs, template="<Directory_TBE>", constructor="m_number_of_TBEs";
diff --git a/src/mem/ruby/protocol/MOESI_CMP_token-L1cache.sm b/src/mem/ruby/protocol/MOESI_CMP_token-L1cache.sm
index 865fce4e3c..24f8146a02 100644
--- a/src/mem/ruby/protocol/MOESI_CMP_token-L1cache.sm
+++ b/src/mem/ruby/protocol/MOESI_CMP_token-L1cache.sm
@@ -198,7 +198,7 @@ machine(MachineType:L1Cache, "Token protocol")
   TBETable L1_TBEs, template="<L1Cache_TBE>", constructor="m_number_of_TBEs";
 
   bool starving, default="false";
-  int l2_select_low_bit, default="RubySystem::getBlockSizeBits()";
+  int l2_select_low_bit, default="m_ruby_system->getBlockSizeBits()";
 
   PersistentTable persistentTable;
   TimerTable useTimerTable;
diff --git a/src/mem/ruby/protocol/MOESI_CMP_token-dir.sm b/src/mem/ruby/protocol/MOESI_CMP_token-dir.sm
index 7f2bdf94e0..8d035a61bb 100644
--- a/src/mem/ruby/protocol/MOESI_CMP_token-dir.sm
+++ b/src/mem/ruby/protocol/MOESI_CMP_token-dir.sm
@@ -171,7 +171,7 @@ machine(MachineType:Directory, "Token protocol")
   TBETable TBEs, template="<Directory_TBE>", constructor="m_number_of_TBEs";
 
   bool starving, default="false";
-  int l2_select_low_bit, default="RubySystem::getBlockSizeBits()";
+  int l2_select_low_bit, default="m_ruby_system->getBlockSizeBits()";
 
   Tick clockEdge();
   Tick clockEdge(Cycles c);
diff --git a/src/mem/ruby/protocol/RubySlicc_Exports.sm b/src/mem/ruby/protocol/RubySlicc_Exports.sm
index 8f0341f328..97770e3516 100644
--- a/src/mem/ruby/protocol/RubySlicc_Exports.sm
+++ b/src/mem/ruby/protocol/RubySlicc_Exports.sm
@@ -72,6 +72,8 @@ structure(WriteMask, external="yes", desc="...") {
   int count();
   int count(int);
   bool test(int);
+  int getBlockSize();
+  void setBlockSize(int);
 }
 
 structure(DataBlock, external = "yes", desc="..."){
diff --git a/src/mem/ruby/protocol/RubySlicc_MemControl.sm b/src/mem/ruby/protocol/RubySlicc_MemControl.sm
index 012b169dea..848ada4d12 100644
--- a/src/mem/ruby/protocol/RubySlicc_MemControl.sm
+++ b/src/mem/ruby/protocol/RubySlicc_MemControl.sm
@@ -89,7 +89,9 @@ structure(MemoryMsg, desc="...", interface="Message") {
     if ((MessageSize == MessageSizeType:Response_Data) ||
         (MessageSize == MessageSizeType:Writeback_Data))  {
       WriteMask read_mask;
-      read_mask.setMask(addressOffset(addr, makeLineAddress(addr)), Len, true);
+      read_mask.setBlockSize(mask.getBlockSize());
+      read_mask.setMask(addressOffset(addr,
+        makeLineAddress(addr, mask.getBlockSize())), Len, true);
       if (MessageSize != MessageSizeType:Writeback_Data) {
         read_mask.setInvertedMask(mask);
       }
diff --git a/src/mem/ruby/protocol/RubySlicc_Types.sm b/src/mem/ruby/protocol/RubySlicc_Types.sm
index 4e0e4f4511..848d16491d 100644
--- a/src/mem/ruby/protocol/RubySlicc_Types.sm
+++ b/src/mem/ruby/protocol/RubySlicc_Types.sm
@@ -94,7 +94,7 @@ structure (Set, external = "yes", non_obj="yes") {
   NodeID smallestElement();
 }
 
-structure (NetDest, external = "yes", non_obj="yes") {
+structure (NetDest, external = "yes", non_obj="yes", implicit_ctor="m_ruby_system") {
   void setSize(int);
   void setSize(int, int);
   void add(NodeID);
diff --git a/src/mem/ruby/protocol/RubySlicc_Util.sm b/src/mem/ruby/protocol/RubySlicc_Util.sm
index 104c7c034c..93976bc4e1 100644
--- a/src/mem/ruby/protocol/RubySlicc_Util.sm
+++ b/src/mem/ruby/protocol/RubySlicc_Util.sm
@@ -52,6 +52,7 @@ Addr intToAddress(int addr);
 int addressOffset(Addr addr, Addr base);
 int max_tokens();
 Addr makeLineAddress(Addr addr);
+Addr makeLineAddress(Addr addr, int cacheLineBits);
 int getOffset(Addr addr);
 int mod(int val, int mod);
 Addr bitSelect(Addr addr, int small, int big);
diff --git a/src/mem/ruby/protocol/chi/CHI-cache.sm b/src/mem/ruby/protocol/chi/CHI-cache.sm
index dcd142ea47..a644bbe506 100644
--- a/src/mem/ruby/protocol/chi/CHI-cache.sm
+++ b/src/mem/ruby/protocol/chi/CHI-cache.sm
@@ -574,7 +574,7 @@ machine(MachineType:Cache, "Cache coherency protocol") :
   ////////////////////////////////////////////////////////////////////////////
 
   // Cache block size
-  int blockSize, default="RubySystem::getBlockSizeBytes()";
+  int blockSize, default="m_ruby_system->getBlockSizeBytes()";
 
   // CacheEntry
   structure(CacheEntry, interface="AbstractCacheEntry") {
diff --git a/src/mem/ruby/protocol/chi/CHI-dvm-misc-node.sm b/src/mem/ruby/protocol/chi/CHI-dvm-misc-node.sm
index aa27c40964..f7616e9ec4 100644
--- a/src/mem/ruby/protocol/chi/CHI-dvm-misc-node.sm
+++ b/src/mem/ruby/protocol/chi/CHI-dvm-misc-node.sm
@@ -192,7 +192,7 @@ machine(MachineType:MiscNode, "CHI Misc Node for handling and distrbuting DVM op
   ////////////////////////////////////////////////////////////////////////////
 
   // Cache block size
-  int blockSize, default="RubySystem::getBlockSizeBytes()";
+  int blockSize, default="m_ruby_system->getBlockSizeBytes()";
 
   // Helper class for tracking expected response and data messages
   structure(ExpectedMap, external ="yes") {
diff --git a/src/mem/ruby/protocol/chi/CHI-mem.sm b/src/mem/ruby/protocol/chi/CHI-mem.sm
index 46f57456a5..58f22d2007 100644
--- a/src/mem/ruby/protocol/chi/CHI-mem.sm
+++ b/src/mem/ruby/protocol/chi/CHI-mem.sm
@@ -157,7 +157,7 @@ machine(MachineType:Memory, "Memory controller interface") :
   ////////////////////////////////////////////////////////////////////////////
 
   // Cache block size
-  int blockSize, default="RubySystem::getBlockSizeBytes()";
+  int blockSize, default="m_ruby_system->getBlockSizeBytes()";
 
   // TBE fields
   structure(TBE, desc="...") {
diff --git a/src/mem/ruby/slicc_interface/AbstractCacheEntry.hh b/src/mem/ruby/slicc_interface/AbstractCacheEntry.hh
index 0e00a60c28..1305deddce 100644
--- a/src/mem/ruby/slicc_interface/AbstractCacheEntry.hh
+++ b/src/mem/ruby/slicc_interface/AbstractCacheEntry.hh
@@ -59,6 +59,8 @@ namespace gem5
 namespace ruby
 {
 
+class RubySystem;
+
 class AbstractCacheEntry : public ReplaceableEntry
 {
   private:
@@ -78,16 +80,15 @@ class AbstractCacheEntry : public ReplaceableEntry
 
     // The methods below are those called by ruby runtime, add when it
     // is absolutely necessary and should all be virtual function.
-    virtual DataBlock&
+    [[noreturn]] virtual DataBlock&
     getDataBlk()
     {
         panic("getDataBlk() not implemented!");
-
-        // Dummy return to appease the compiler
-        static DataBlock b;
-        return b;
     }
 
+    virtual void initBlockSize(int block_size) { }  // no-op by default
+    virtual void setRubySystem(RubySystem *rs) { }  // no-op by default
+
     int validBlocks;
     virtual int& getNumValidBlocks()
     {
diff --git a/src/mem/ruby/slicc_interface/AbstractController.cc b/src/mem/ruby/slicc_interface/AbstractController.cc
index 36092387ac..0bcc662629 100644
--- a/src/mem/ruby/slicc_interface/AbstractController.cc
+++ b/src/mem/ruby/slicc_interface/AbstractController.cc
@@ -89,6 +89,9 @@ AbstractController::init()
         getMemReqQueue()->setConsumer(this);
     }
 
+    downstreamDestinations.setRubySystem(m_ruby_system);
+    upstreamDestinations.setRubySystem(m_ruby_system);
+
     // Initialize the addr->downstream machine mappings. Multiple machines
     // in downstream_destinations can have the same address range if they have
     // different types. If this is the case, mapAddressToDownstreamMachine
@@ -268,7 +271,7 @@ AbstractController::serviceMemoryQueue()
     }
 
     const MemoryMsg *mem_msg = (const MemoryMsg*)mem_queue->peek();
-    unsigned int req_size = RubySystem::getBlockSizeBytes();
+    unsigned int req_size = m_ruby_system->getBlockSizeBytes();
     if (mem_msg->m_Len > 0) {
         req_size = mem_msg->m_Len;
     }
@@ -294,7 +297,7 @@ AbstractController::serviceMemoryQueue()
     SenderState *s = new SenderState(mem_msg->m_Sender);
     pkt->pushSenderState(s);
 
-    if (RubySystem::getWarmupEnabled()) {
+    if (m_ruby_system->getWarmupEnabled()) {
         // Use functional rather than timing accesses during warmup
         mem_queue->dequeue(clockEdge());
         memoryPort.sendFunctional(pkt);
@@ -382,7 +385,10 @@ AbstractController::recvTimingResp(PacketPtr pkt)
         return false;
     }
 
-    std::shared_ptr<MemoryMsg> msg = std::make_shared<MemoryMsg>(clockEdge());
+    int blk_size = m_ruby_system->getBlockSizeBytes();
+
+    std::shared_ptr<MemoryMsg> msg =
+        std::make_shared<MemoryMsg>(clockEdge(), blk_size, m_ruby_system);
     (*msg).m_addr = pkt->getAddr();
     (*msg).m_Sender = m_machineID;
 
@@ -396,7 +402,7 @@ AbstractController::recvTimingResp(PacketPtr pkt)
 
         // Copy data from the packet
         (*msg).m_DataBlk.setData(pkt->getPtr<uint8_t>(), 0,
-                                 RubySystem::getBlockSizeBytes());
+                                 m_ruby_system->getBlockSizeBytes());
     } else if (pkt->isWrite()) {
         (*msg).m_Type = MemoryRequestType_MEMORY_WB;
         (*msg).m_MessageSize = MessageSizeType_Writeback_Control;
@@ -404,7 +410,8 @@ AbstractController::recvTimingResp(PacketPtr pkt)
         panic("Incorrect packet type received from memory controller!");
     }
 
-    memRspQueue->enqueue(msg, clockEdge(), cyclesToTicks(Cycles(1)));
+    memRspQueue->enqueue(msg, clockEdge(), cyclesToTicks(Cycles(1)),
+        m_ruby_system->getRandomization(), m_ruby_system->getWarmupEnabled());
     delete pkt;
     return true;
 }
@@ -471,6 +478,45 @@ AbstractController::sendRetryRespToMem() {
     }
 }
 
+Addr
+AbstractController::getOffset(Addr addr) const
+{
+    return ruby::getOffset(addr, m_ruby_system->getBlockSizeBits());
+}
+
+Addr
+AbstractController::makeLineAddress(Addr addr) const
+{
+    return ruby::makeLineAddress(addr, m_ruby_system->getBlockSizeBits());
+}
+
+std::string
+AbstractController::printAddress(Addr addr) const
+{
+    return ruby::printAddress(addr, m_ruby_system->getBlockSizeBits());
+}
+
+NetDest
+AbstractController::broadcast(MachineType type)
+{
+    assert(m_ruby_system != nullptr);
+    NodeID type_count = m_ruby_system->MachineType_base_count(type);
+    // Pass the RubySystem explicitly, as Topology.cc does for its NetDests.
+    NetDest dest(m_ruby_system);
+    for (NodeID i = 0; i < type_count; i++) {
+        MachineID mach = {type, i};
+        dest.add(mach);
+    }
+    return dest;
+}
+
+int
+AbstractController::machineCount(MachineType machType)
+{
+    assert(m_ruby_system != nullptr);
+    return m_ruby_system->MachineType_base_count(machType);
+}
+
 bool
 AbstractController::MemoryPort::recvTimingResp(PacketPtr pkt)
 {
diff --git a/src/mem/ruby/slicc_interface/AbstractController.hh b/src/mem/ruby/slicc_interface/AbstractController.hh
index ce6a6972af..79f67073a6 100644
--- a/src/mem/ruby/slicc_interface/AbstractController.hh
+++ b/src/mem/ruby/slicc_interface/AbstractController.hh
@@ -72,6 +72,7 @@ namespace ruby
 class Network;
 class GPUCoalescer;
 class DMASequencer;
+class RubySystem;
 
 // used to communicate that an in_port peeked the wrong message type
 class RejectException: public std::exception
@@ -229,6 +230,11 @@ class AbstractController : public ClockedObject, public Consumer
     /** List of upstream destinations (towards the CPU) */
     const NetDest& allUpstreamDest() const { return upstreamDestinations; }
 
+    // Helper methods for commonly used functions called in common/address.hh
+    Addr getOffset(Addr addr) const;
+    Addr makeLineAddress(Addr addr) const;
+    std::string printAddress(Addr addr) const;
+
   protected:
     //! Profiles original cache requests including PUTs
     void profileRequest(const std::string &request);
@@ -452,6 +458,13 @@ class AbstractController : public ClockedObject, public Consumer
         {}
     };
 
+    RubySystem *m_ruby_system = nullptr;
+
+    // Formerly in RubySlicc_ComponentMapping.hh. Moved here to access
+    // RubySystem pointer.
+    NetDest broadcast(MachineType type);
+    int machineCount(MachineType machType);
+
   private:
     /** The address range to which the controller responds on the CPU side. */
     const AddrRangeList addrRanges;
diff --git a/src/mem/ruby/slicc_interface/Message.hh b/src/mem/ruby/slicc_interface/Message.hh
index 5c824c4a38..31fb5e8e92 100644
--- a/src/mem/ruby/slicc_interface/Message.hh
+++ b/src/mem/ruby/slicc_interface/Message.hh
@@ -62,10 +62,12 @@ typedef std::shared_ptr<Message> MsgPtr;
 class Message
 {
   public:
-    Message(Tick curTime)
-        : m_time(curTime),
+    Message(Tick curTime, int block_size, const RubySystem *rs)
+        : m_block_size(block_size),
+          m_time(curTime),
           m_LastEnqueueTime(curTime),
-          m_DelayedTicks(0), m_msg_counter(0)
+          m_DelayedTicks(0), m_msg_counter(0),
+          p_ruby_system(rs)
     { }
 
     Message(const Message &other) = default;
@@ -121,6 +123,9 @@ class Message
     int getVnet() const { return vnet; }
     void setVnet(int net) { vnet = net; }
 
+  protected:
+    int m_block_size = 0;
+
   private:
     Tick m_time;
     Tick m_LastEnqueueTime; // my last enqueue time
@@ -130,6 +135,9 @@ class Message
     // Variables for required network traversal
     int incoming_link;
     int vnet;
+
+    // Needed to call MachineType_base_count/level
+    const RubySystem *p_ruby_system = nullptr;
 };
 
 inline bool
diff --git a/src/mem/ruby/slicc_interface/RubyRequest.hh b/src/mem/ruby/slicc_interface/RubyRequest.hh
index a258a18f9a..58eae229be 100644
--- a/src/mem/ruby/slicc_interface/RubyRequest.hh
+++ b/src/mem/ruby/slicc_interface/RubyRequest.hh
@@ -86,11 +86,12 @@ class RubyRequest : public Message
     bool m_isSLCSet;
     bool m_isSecure;
 
-    RubyRequest(Tick curTime, uint64_t _paddr, int _len,
+    RubyRequest(Tick curTime, int block_size, RubySystem *rs,
+        uint64_t _paddr, int _len,
         uint64_t _pc, RubyRequestType _type, RubyAccessMode _access_mode,
         PacketPtr _pkt, PrefetchBit _pb = PrefetchBit_No,
         ContextID _proc_id = 100, ContextID _core_id = 99)
-        : Message(curTime),
+        : Message(curTime, block_size, rs),
           m_PhysicalAddress(_paddr),
           m_Type(_type),
           m_ProgramCounter(_pc),
@@ -99,13 +100,16 @@ class RubyRequest : public Message
           m_Prefetch(_pb),
           m_pkt(_pkt),
           m_contextId(_core_id),
+          m_writeMask(block_size),
+          m_WTData(block_size),
           m_htmFromTransaction(false),
           m_htmTransactionUid(0),
           m_isTlbi(false),
           m_tlbiTransactionUid(0),
           m_isSecure(m_pkt ? m_pkt->req->isSecure() : false)
     {
-        m_LineAddress = makeLineAddress(m_PhysicalAddress);
+        int block_size_bits = floorLog2(block_size);
+        m_LineAddress = makeLineAddress(m_PhysicalAddress, block_size_bits);
         if (_pkt) {
             m_isGLCSet = m_pkt->req->isGLCSet();
             m_isSLCSet = m_pkt->req->isSLCSet();
@@ -116,10 +120,10 @@ class RubyRequest : public Message
     }
 
     /** RubyRequest for memory management commands */
-    RubyRequest(Tick curTime,
+    RubyRequest(Tick curTime, int block_size, RubySystem *rs,
         uint64_t _pc, RubyRequestType _type, RubyAccessMode _access_mode,
         PacketPtr _pkt, ContextID _proc_id, ContextID _core_id)
-        : Message(curTime),
+        : Message(curTime, block_size, rs),
           m_PhysicalAddress(0),
           m_Type(_type),
           m_ProgramCounter(_pc),
@@ -128,6 +132,8 @@ class RubyRequest : public Message
           m_Prefetch(PrefetchBit_No),
           m_pkt(_pkt),
           m_contextId(_core_id),
+          m_writeMask(block_size),
+          m_WTData(block_size),
           m_htmFromTransaction(false),
           m_htmTransactionUid(0),
           m_isTlbi(false),
@@ -144,14 +150,14 @@ class RubyRequest : public Message
         }
     }
 
-    RubyRequest(Tick curTime, uint64_t _paddr, int _len,
-        uint64_t _pc, RubyRequestType _type,
+    RubyRequest(Tick curTime, int block_size, RubySystem *rs,
+        uint64_t _paddr, int _len, uint64_t _pc, RubyRequestType _type,
         RubyAccessMode _access_mode, PacketPtr _pkt, PrefetchBit _pb,
         unsigned _proc_id, unsigned _core_id,
         int _wm_size, std::vector<bool> & _wm_mask,
         DataBlock & _Data,
         uint64_t _instSeqNum = 0)
-        : Message(curTime),
+        : Message(curTime, block_size, rs),
           m_PhysicalAddress(_paddr),
           m_Type(_type),
           m_ProgramCounter(_pc),
@@ -170,7 +176,8 @@ class RubyRequest : public Message
           m_tlbiTransactionUid(0),
           m_isSecure(m_pkt->req->isSecure())
     {
-        m_LineAddress = makeLineAddress(m_PhysicalAddress);
+        int block_size_bits = floorLog2(block_size);
+        m_LineAddress = makeLineAddress(m_PhysicalAddress, block_size_bits);
         if (_pkt) {
             m_isGLCSet = m_pkt->req->isGLCSet();
             m_isSLCSet = m_pkt->req->isSLCSet();
@@ -180,15 +187,15 @@ class RubyRequest : public Message
         }
     }
 
-    RubyRequest(Tick curTime, uint64_t _paddr, int _len,
-        uint64_t _pc, RubyRequestType _type,
+    RubyRequest(Tick curTime, int block_size, RubySystem *rs,
+        uint64_t _paddr, int _len, uint64_t _pc, RubyRequestType _type,
         RubyAccessMode _access_mode, PacketPtr _pkt, PrefetchBit _pb,
         unsigned _proc_id, unsigned _core_id,
         int _wm_size, std::vector<bool> & _wm_mask,
         DataBlock & _Data,
         std::vector< std::pair<int,AtomicOpFunctor*> > _atomicOps,
         uint64_t _instSeqNum = 0)
-        : Message(curTime),
+        : Message(curTime, block_size, rs),
           m_PhysicalAddress(_paddr),
           m_Type(_type),
           m_ProgramCounter(_pc),
@@ -207,7 +214,8 @@ class RubyRequest : public Message
           m_tlbiTransactionUid(0),
           m_isSecure(m_pkt->req->isSecure())
     {
-        m_LineAddress = makeLineAddress(m_PhysicalAddress);
+        int block_size_bits = floorLog2(block_size);
+        m_LineAddress = makeLineAddress(m_PhysicalAddress, block_size_bits);
         if (_pkt) {
             m_isGLCSet = m_pkt->req->isGLCSet();
             m_isSLCSet = m_pkt->req->isSLCSet();
@@ -218,7 +226,12 @@ class RubyRequest : public Message
         }
     }
 
-    RubyRequest(Tick curTime) : Message(curTime) {}
+    RubyRequest(Tick curTime, int block_size, RubySystem *rs)
+        : Message(curTime, block_size, rs),
+          m_writeMask(block_size),
+          m_WTData(block_size)
+    {
+    }
     MsgPtr clone() const
     { return std::shared_ptr<Message>(new RubyRequest(*this)); }
 
diff --git a/src/mem/ruby/slicc_interface/RubySlicc_ComponentMapping.hh b/src/mem/ruby/slicc_interface/RubySlicc_ComponentMapping.hh
index 9a433d1cee..1195089fc3 100644
--- a/src/mem/ruby/slicc_interface/RubySlicc_ComponentMapping.hh
+++ b/src/mem/ruby/slicc_interface/RubySlicc_ComponentMapping.hh
@@ -41,17 +41,6 @@ namespace gem5
 namespace ruby
 {
 
-inline NetDest
-broadcast(MachineType type)
-{
-    NetDest dest;
-    for (NodeID i = 0; i < MachineType_base_count(type); i++) {
-        MachineID mach = {type, i};
-        dest.add(mach);
-    }
-    return dest;
-}
-
 inline MachineID
 mapAddressToRange(Addr addr, MachineType type, int low_bit,
                   int num_bits, int cluster_id = 0)
@@ -77,12 +66,6 @@ machineIDToMachineType(MachineID machID)
     return machID.type;
 }
 
-inline int
-machineCount(MachineType machType)
-{
-    return MachineType_base_count(machType);
-}
-
 inline MachineID
 createMachineID(MachineType type, NodeID id)
 {
diff --git a/src/mem/ruby/slicc_interface/RubySlicc_Util.hh b/src/mem/ruby/slicc_interface/RubySlicc_Util.hh
index 8df56c7013..f4a49463a8 100644
--- a/src/mem/ruby/slicc_interface/RubySlicc_Util.hh
+++ b/src/mem/ruby/slicc_interface/RubySlicc_Util.hh
@@ -233,8 +233,9 @@ addressOffset(Addr addr, Addr base)
 inline bool
 testAndRead(Addr addr, DataBlock& blk, Packet *pkt)
 {
-    Addr pktLineAddr = makeLineAddress(pkt->getAddr());
-    Addr lineAddr = makeLineAddress(addr);
+    int block_size_bits = floorLog2(blk.getBlockSize());
+    Addr pktLineAddr = makeLineAddress(pkt->getAddr(), block_size_bits);
+    Addr lineAddr = makeLineAddress(addr, block_size_bits);
 
     if (pktLineAddr == lineAddr) {
         uint8_t *data = pkt->getPtr<uint8_t>();
@@ -259,8 +260,10 @@ testAndRead(Addr addr, DataBlock& blk, Packet *pkt)
 inline bool
 testAndReadMask(Addr addr, DataBlock& blk, WriteMask& mask, Packet *pkt)
 {
-    Addr pktLineAddr = makeLineAddress(pkt->getAddr());
-    Addr lineAddr = makeLineAddress(addr);
+    assert(blk.getBlockSize() == mask.getBlockSize());
+    int block_size_bits = floorLog2(blk.getBlockSize());
+    Addr pktLineAddr = makeLineAddress(pkt->getAddr(), block_size_bits);
+    Addr lineAddr = makeLineAddress(addr, block_size_bits);
 
     if (pktLineAddr == lineAddr) {
         uint8_t *data = pkt->getPtr<uint8_t>();
@@ -288,8 +291,9 @@ testAndReadMask(Addr addr, DataBlock& blk, WriteMask& mask, Packet *pkt)
 inline bool
 testAndWrite(Addr addr, DataBlock& blk, Packet *pkt)
 {
-    Addr pktLineAddr = makeLineAddress(pkt->getAddr());
-    Addr lineAddr = makeLineAddress(addr);
+    int block_size_bits = floorLog2(blk.getBlockSize());
+    Addr pktLineAddr = makeLineAddress(pkt->getAddr(), block_size_bits);
+    Addr lineAddr = makeLineAddress(addr, block_size_bits);
 
     if (pktLineAddr == lineAddr) {
         const uint8_t *data = pkt->getConstPtr<uint8_t>();
diff --git a/src/mem/ruby/structures/ALUFreeListArray.cc b/src/mem/ruby/structures/ALUFreeListArray.cc
index 87b5cbfbd2..3e25e5b599 100644
--- a/src/mem/ruby/structures/ALUFreeListArray.cc
+++ b/src/mem/ruby/structures/ALUFreeListArray.cc
@@ -57,10 +57,10 @@ namespace ruby
 *       - The same line has been accessed in the past accessLatency ticks
 */
 
-ALUFreeListArray::ALUFreeListArray(unsigned int num_ALUs, Tick access_latency)
+ALUFreeListArray::ALUFreeListArray(unsigned int num_ALUs, Cycles access_clocks)
 {
     this->numALUs = num_ALUs;
-    this->accessLatency = access_latency;
+    this->accessClocks = access_clocks;
 }
 
 bool ALUFreeListArray::tryAccess(Addr addr)
@@ -85,7 +85,7 @@ bool ALUFreeListArray::tryAccess(Addr addr)
         }
 
         // Block access if the line is already being used
-        if (record.lineAddr == makeLineAddress(addr)) {
+        if (record.lineAddr == makeLineAddress(addr, m_block_size_bits)) {
             return false;
         }
     }
@@ -99,7 +99,9 @@ void ALUFreeListArray::reserve(Addr addr)
     // the access is valid
 
     // Add record to queue
-    accessQueue.push_front(AccessRecord(makeLineAddress(addr), curTick()));
+    accessQueue.push_front(
+        AccessRecord(makeLineAddress(addr, m_block_size_bits), curTick())
+    );
 }
 
 } // namespace ruby
diff --git a/src/mem/ruby/structures/ALUFreeListArray.hh b/src/mem/ruby/structures/ALUFreeListArray.hh
index bed1b00b5c..5c4fdd95f9 100644
--- a/src/mem/ruby/structures/ALUFreeListArray.hh
+++ b/src/mem/ruby/structures/ALUFreeListArray.hh
@@ -32,6 +32,7 @@
 
 #include <deque>
 
+#include "base/intmath.hh"
 #include "mem/ruby/common/TypeDefines.hh"
 #include "sim/cur_tick.hh"
 
@@ -45,7 +46,8 @@ class ALUFreeListArray
 {
   private:
     unsigned int numALUs;
-    Tick accessLatency;
+    Cycles accessClocks;
+    Tick accessLatency = 0;
 
     class AccessRecord
     {
@@ -62,14 +64,33 @@ class ALUFreeListArray
     // Queue of accesses from past accessLatency cycles
     std::deque<AccessRecord> accessQueue;
 
+    int m_block_size_bits = 0;
+
   public:
-    ALUFreeListArray(unsigned int num_ALUs, Tick access_latency);
+    ALUFreeListArray(unsigned int num_ALUs, Cycles access_clocks);
 
     bool tryAccess(Addr addr);
 
     void reserve(Addr addr);
 
-    Tick getLatency() const { return accessLatency; }
+    Tick
+    getLatency() const
+    {
+        assert(accessLatency > 0);
+        return accessLatency;
+    }
+
+    void
+    setClockPeriod(Tick clockPeriod)
+    {
+        accessLatency = accessClocks * clockPeriod;
+    }
+
+    void
+    setBlockSize(int block_size)
+    {
+        m_block_size_bits = floorLog2(block_size);
+    }
 };
 
 } // namespace ruby
diff --git a/src/mem/ruby/structures/BankedArray.cc b/src/mem/ruby/structures/BankedArray.cc
index 0f01d5c396..2c2202dec5 100644
--- a/src/mem/ruby/structures/BankedArray.cc
+++ b/src/mem/ruby/structures/BankedArray.cc
@@ -42,8 +42,7 @@ namespace ruby
 {
 
 BankedArray::BankedArray(unsigned int banks, Cycles accessLatency,
-                         unsigned int startIndexBit, RubySystem *rs)
-    : m_ruby_system(rs)
+                         unsigned int startIndexBit)
 {
     this->banks = banks;
     this->accessLatency = accessLatency;
@@ -78,6 +77,8 @@ BankedArray::reserve(int64_t idx)
     if (accessLatency == 0)
         return;
 
+    assert(clockPeriod > 0);
+
     unsigned int bank = mapIndexToBank(idx);
     assert(bank < banks);
 
@@ -95,7 +96,7 @@ BankedArray::reserve(int64_t idx)
     busyBanks[bank].idx = idx;
     busyBanks[bank].startAccess = curTick();
     busyBanks[bank].endAccess = curTick() +
-        (accessLatency-1) * m_ruby_system->clockPeriod();
+        (accessLatency-1) * clockPeriod;
 }
 
 unsigned int
diff --git a/src/mem/ruby/structures/BankedArray.hh b/src/mem/ruby/structures/BankedArray.hh
index c757759296..ecc984a617 100644
--- a/src/mem/ruby/structures/BankedArray.hh
+++ b/src/mem/ruby/structures/BankedArray.hh
@@ -48,6 +48,7 @@ class BankedArray
   private:
     unsigned int banks;
     Cycles accessLatency;
+    Tick clockPeriod = 0;
     unsigned int bankBits;
     unsigned int startIndexBit;
     RubySystem *m_ruby_system;
@@ -69,7 +70,7 @@ class BankedArray
 
   public:
     BankedArray(unsigned int banks, Cycles accessLatency,
-                unsigned int startIndexBit, RubySystem *rs);
+                unsigned int startIndexBit);
 
     // Note: We try the access based on the cache index, not the address
     // This is so we don't get aliasing on blocks being replaced
@@ -78,6 +79,8 @@ class BankedArray
     void reserve(int64_t idx);
 
     Cycles getLatency() const { return accessLatency; }
+
+    void setClockPeriod(Tick _clockPeriod) { clockPeriod = _clockPeriod; }
 };
 
 } // namespace ruby
diff --git a/src/mem/ruby/structures/CacheMemory.cc b/src/mem/ruby/structures/CacheMemory.cc
index 90d67fb29b..6bc35bac7d 100644
--- a/src/mem/ruby/structures/CacheMemory.cc
+++ b/src/mem/ruby/structures/CacheMemory.cc
@@ -69,12 +69,9 @@ operator<<(std::ostream& out, const CacheMemory& obj)
 
 CacheMemory::CacheMemory(const Params &p)
     : SimObject(p),
-    dataArray(p.dataArrayBanks, p.dataAccessLatency,
-              p.start_index_bit, p.ruby_system),
-    tagArray(p.tagArrayBanks, p.tagAccessLatency,
-             p.start_index_bit, p.ruby_system),
-    atomicALUArray(p.atomicALUs, p.atomicLatency *
-             p.ruby_system->clockPeriod()),
+    dataArray(p.dataArrayBanks, p.dataAccessLatency, p.start_index_bit),
+    tagArray(p.tagArrayBanks, p.tagAccessLatency, p.start_index_bit),
+    atomicALUArray(p.atomicALUs, p.atomicLatency),
     cacheMemoryStats(this)
 {
     m_cache_size = p.size;
@@ -88,12 +85,25 @@ CacheMemory::CacheMemory(const Params &p)
                                     m_replacementPolicy_ptr) ? true : false;
 }
 
+void
+CacheMemory::setRubySystem(RubySystem* rs)
+{
+    dataArray.setClockPeriod(rs->clockPeriod());
+    tagArray.setClockPeriod(rs->clockPeriod());
+    atomicALUArray.setClockPeriod(rs->clockPeriod());
+    atomicALUArray.setBlockSize(rs->getBlockSizeBytes());
+
+    if (m_block_size == 0) {
+        m_block_size = rs->getBlockSizeBytes();
+    }
+
+    m_ruby_system = rs;
+}
+
 void
 CacheMemory::init()
 {
-    if (m_block_size == 0) {
-        m_block_size = RubySystem::getBlockSizeBytes();
-    }
+    assert(m_block_size != 0);
     m_cache_num_sets = (m_cache_size / m_cache_assoc) / m_block_size;
     assert(m_cache_num_sets > 1);
     m_cache_num_set_bits = floorLog2(m_cache_num_sets);
@@ -286,6 +296,9 @@ CacheMemory::allocate(Addr address, AbstractCacheEntry *entry)
     assert(cacheAvail(address));
     DPRINTF(RubyCache, "allocating address: %#x\n", address);
 
+    entry->initBlockSize(m_block_size);
+    entry->setRubySystem(m_ruby_system);
+
     // Find the first open slot
     int64_t cacheSet = addressToCacheSet(address);
     std::vector<AbstractCacheEntry*> &set = m_cache[cacheSet];
diff --git a/src/mem/ruby/structures/CacheMemory.hh b/src/mem/ruby/structures/CacheMemory.hh
index de7c327f63..912ae22d1f 100644
--- a/src/mem/ruby/structures/CacheMemory.hh
+++ b/src/mem/ruby/structures/CacheMemory.hh
@@ -154,6 +154,8 @@ class CacheMemory : public SimObject
     void htmAbortTransaction();
     void htmCommitTransaction();
 
+    void setRubySystem(RubySystem* rs);
+
   public:
     int getCacheSize() const { return m_cache_size; }
     int getCacheAssoc() const { return m_cache_assoc; }
@@ -213,6 +215,14 @@ class CacheMemory : public SimObject
      */
     bool m_use_occupancy;
 
+    RubySystem *m_ruby_system = nullptr;
+
+    Addr
+    makeLineAddress(Addr addr) const
+    {
+        return ruby::makeLineAddress(addr, floorLog2(m_block_size));
+    }
+
     private:
       struct CacheMemoryStats : public statistics::Group
       {
diff --git a/src/mem/ruby/structures/DirectoryMemory.cc b/src/mem/ruby/structures/DirectoryMemory.cc
index 620254b82c..7469f72451 100644
--- a/src/mem/ruby/structures/DirectoryMemory.cc
+++ b/src/mem/ruby/structures/DirectoryMemory.cc
@@ -64,12 +64,14 @@ DirectoryMemory::DirectoryMemory(const Params &p)
     }
     m_size_bits = floorLog2(m_size_bytes);
     m_num_entries = 0;
+    m_block_size = p.block_size;
+    m_ruby_system = p.ruby_system;
 }
 
 void
 DirectoryMemory::init()
 {
-    m_num_entries = m_size_bytes / RubySystem::getBlockSizeBytes();
+    m_num_entries = m_size_bytes / m_block_size;
     m_entries = new AbstractCacheEntry*[m_num_entries];
     for (int i = 0; i < m_num_entries; i++)
         m_entries[i] = NULL;
@@ -108,7 +110,7 @@ DirectoryMemory::mapAddressToLocalIdx(Addr address)
         }
         ret += r.size();
     }
-    return ret >> RubySystem::getBlockSizeBits();
+    return ret >> (floorLog2(m_block_size));
 }
 
 AbstractCacheEntry*
@@ -133,6 +135,8 @@ DirectoryMemory::allocate(Addr address, AbstractCacheEntry *entry)
     assert(idx < m_num_entries);
     assert(m_entries[idx] == NULL);
     entry->changePermission(AccessPermission_Read_Only);
+    entry->initBlockSize(m_block_size);
+    entry->setRubySystem(m_ruby_system);
     m_entries[idx] = entry;
 
     return entry;
diff --git a/src/mem/ruby/structures/DirectoryMemory.hh b/src/mem/ruby/structures/DirectoryMemory.hh
index 8a4532864d..6e77e2a4ca 100644
--- a/src/mem/ruby/structures/DirectoryMemory.hh
+++ b/src/mem/ruby/structures/DirectoryMemory.hh
@@ -104,6 +104,9 @@ class DirectoryMemory : public SimObject
     uint64_t m_size_bytes;
     uint64_t m_size_bits;
     uint64_t m_num_entries;
+    uint32_t m_block_size;
+
+    RubySystem *m_ruby_system = nullptr;
 
     /**
      * The address range for which the directory responds. Normally
diff --git a/src/mem/ruby/structures/DirectoryMemory.py b/src/mem/ruby/structures/DirectoryMemory.py
index 85f05367cf..202617bceb 100644
--- a/src/mem/ruby/structures/DirectoryMemory.py
+++ b/src/mem/ruby/structures/DirectoryMemory.py
@@ -49,3 +49,7 @@ class RubyDirectoryMemory(SimObject):
     addr_ranges = VectorParam.AddrRange(
         Parent.addr_ranges, "Address range this directory responds to"
     )
+    block_size = Param.UInt32(
+        "Size of a block in bytes. Usually same as cache line size."
+    )
+    ruby_system = Param.RubySystem(Parent.any, "")
diff --git a/src/mem/ruby/structures/PerfectCacheMemory.hh b/src/mem/ruby/structures/PerfectCacheMemory.hh
index 664d10f202..0966ca80d2 100644
--- a/src/mem/ruby/structures/PerfectCacheMemory.hh
+++ b/src/mem/ruby/structures/PerfectCacheMemory.hh
@@ -74,6 +74,8 @@ class PerfectCacheMemory
   public:
     PerfectCacheMemory();
 
+    void setBlockSize(const int block_size) { m_block_size = block_size; }
+
     // tests to see if an address is present in the cache
     bool isTagPresent(Addr address) const;
 
@@ -108,6 +110,8 @@ class PerfectCacheMemory
 
     // Data Members (m_prefix)
     std::unordered_map<Addr, PerfectCacheLineState<ENTRY> > m_map;
+
+    int m_block_size = 0;
 };
 
 template<class ENTRY>
@@ -130,7 +134,7 @@ template<class ENTRY>
 inline bool
 PerfectCacheMemory<ENTRY>::isTagPresent(Addr address) const
 {
-    return m_map.count(makeLineAddress(address)) > 0;
+    return m_map.count(makeLineAddress(address, floorLog2(m_block_size))) > 0;
 }
 
 template<class ENTRY>
@@ -149,7 +153,8 @@ PerfectCacheMemory<ENTRY>::allocate(Addr address)
     PerfectCacheLineState<ENTRY> line_state;
     line_state.m_permission = AccessPermission_Invalid;
     line_state.m_entry = ENTRY();
-    m_map[makeLineAddress(address)] = line_state;
+    Addr line_addr = makeLineAddress(address, floorLog2(m_block_size));
+    m_map.emplace(line_addr, line_state);
 }
 
 // deallocate entry
@@ -157,7 +162,8 @@ template<class ENTRY>
 inline void
 PerfectCacheMemory<ENTRY>::deallocate(Addr address)
 {
-    [[maybe_unused]] auto num_erased = m_map.erase(makeLineAddress(address));
+    Addr line_addr = makeLineAddress(address, floorLog2(m_block_size));
+    [[maybe_unused]] auto num_erased = m_map.erase(line_addr);
     assert(num_erased == 1);
 }
 
@@ -175,7 +181,8 @@ template<class ENTRY>
 inline ENTRY*
 PerfectCacheMemory<ENTRY>::lookup(Addr address)
 {
-    return &m_map[makeLineAddress(address)].m_entry;
+    Addr line_addr = makeLineAddress(address, floorLog2(m_block_size));
+    return &m_map[line_addr].m_entry;
 }
 
 // looks an address up in the cache
@@ -183,14 +190,16 @@ template<class ENTRY>
 inline const ENTRY*
 PerfectCacheMemory<ENTRY>::lookup(Addr address) const
 {
-    return &m_map[makeLineAddress(address)].m_entry;
+    Addr line_addr = makeLineAddress(address, floorLog2(m_block_size));
+    return &m_map[line_addr].m_entry;
 }
 
 template<class ENTRY>
 inline AccessPermission
 PerfectCacheMemory<ENTRY>::getPermission(Addr address) const
 {
-    return m_map[makeLineAddress(address)].m_permission;
+    Addr line_addr = makeLineAddress(address, floorLog2(m_block_size));
+    return m_map[line_addr].m_permission;
 }
 
 template<class ENTRY>
@@ -198,8 +207,8 @@ inline void
 PerfectCacheMemory<ENTRY>::changePermission(Addr address,
                                             AccessPermission new_perm)
 {
-    Addr line_address = makeLineAddress(address);
-    PerfectCacheLineState<ENTRY>& line_state = m_map[line_address];
+    Addr line_addr = makeLineAddress(address, floorLog2(m_block_size));
+    PerfectCacheLineState<ENTRY>& line_state = m_map[line_addr];
     line_state.m_permission = new_perm;
 }
 
diff --git a/src/mem/ruby/structures/PersistentTable.hh b/src/mem/ruby/structures/PersistentTable.hh
index 5382269273..1162e1dda1 100644
--- a/src/mem/ruby/structures/PersistentTable.hh
+++ b/src/mem/ruby/structures/PersistentTable.hh
@@ -63,6 +63,12 @@ class PersistentTable
     // Destructor
     ~PersistentTable();
 
+    void
+    setBlockSize(int block_size)
+    {
+        m_block_size_bits = floorLog2(block_size);
+    }
+
     // Public Methods
     void persistentRequestLock(Addr address, MachineID locker,
                                AccessType type);
@@ -82,9 +88,17 @@ class PersistentTable
     PersistentTable(const PersistentTable& obj);
     PersistentTable& operator=(const PersistentTable& obj);
 
+    int m_block_size_bits = 0;
+
     // Data Members (m_prefix)
     typedef std::unordered_map<Addr, PersistentTableEntry> AddressMap;
     AddressMap m_map;
+
+    Addr
+    makeLineAddress(Addr addr) const
+    {
+        return ruby::makeLineAddress(addr, m_block_size_bits);
+    }
 };
 
 inline std::ostream&
diff --git a/src/mem/ruby/structures/RubyCache.py b/src/mem/ruby/structures/RubyCache.py
index 2f457f5c4a..4b1023fc61 100644
--- a/src/mem/ruby/structures/RubyCache.py
+++ b/src/mem/ruby/structures/RubyCache.py
@@ -54,4 +54,3 @@ class RubyCache(SimObject):
     dataAccessLatency = Param.Cycles(1, "cycles for a data array access")
     tagAccessLatency = Param.Cycles(1, "cycles for a tag array access")
     resourceStalls = Param.Bool(False, "stall if there is a resource failure")
-    ruby_system = Param.RubySystem(Parent.any, "")
diff --git a/src/mem/ruby/structures/RubyPrefetcher.cc b/src/mem/ruby/structures/RubyPrefetcher.cc
index e45eff2c2f..bffcfe2327 100644
--- a/src/mem/ruby/structures/RubyPrefetcher.cc
+++ b/src/mem/ruby/structures/RubyPrefetcher.cc
@@ -56,13 +56,15 @@ namespace ruby
 
 RubyPrefetcher::RubyPrefetcher(const Params &p)
     : SimObject(p), m_num_streams(p.num_streams),
-    m_array(p.num_streams), m_train_misses(p.train_misses),
+    m_array(p.num_streams, p.block_size), m_train_misses(p.train_misses),
     m_num_startup_pfs(p.num_startup_pfs),
     unitFilter(p.unit_filter),
     negativeFilter(p.unit_filter),
     nonUnitFilter(p.nonunit_filter),
     m_prefetch_cross_pages(p.cross_page),
     pageShift(p.page_shift),
+    m_block_size_bits(floorLog2(p.block_size)),
+    m_block_size_bytes(p.block_size),
     rubyPrefetcherStats(this)
 {
     assert(m_num_streams > 0);
@@ -90,7 +92,7 @@ void
 RubyPrefetcher::observeMiss(Addr address, const RubyRequestType& type)
 {
     DPRINTF(RubyPrefetcher, "Observed miss for %#x\n", address);
-    Addr line_addr = makeLineAddress(address);
+    Addr line_addr = makeLineAddress(address, m_block_size_bits);
     rubyPrefetcherStats.numMissObserved++;
 
     // check to see if we have already issued a prefetch for this block
@@ -214,7 +216,7 @@ RubyPrefetcher::initializeStream(Addr address, int stride,
 
     // initialize the stream prefetcher
     PrefetchEntry *mystream = &(m_array[index]);
-    mystream->m_address = makeLineAddress(address);
+    mystream->m_address = makeLineAddress(address, m_block_size_bits);
     mystream->m_stride = stride;
     mystream->m_use_time = m_controller->curCycle();
     mystream->m_is_valid = true;
@@ -222,7 +224,7 @@ RubyPrefetcher::initializeStream(Addr address, int stride,
 
     // create a number of initial prefetches for this stream
     Addr page_addr = pageAddress(mystream->m_address);
-    Addr line_addr = makeLineAddress(mystream->m_address);
+    Addr line_addr = makeLineAddress(mystream->m_address, m_block_size_bits);
 
     // insert a number of prefetches into the prefetch table
     for (int k = 0; k < m_num_startup_pfs; k++) {
@@ -312,8 +314,7 @@ RubyPrefetcher::accessNonunitFilter(Addr line_addr,
                         // This stride HAS to be the multiplicative constant of
                         // dataBlockBytes (bc makeNextStrideAddress is
                         // calculated based on this multiplicative constant!)
-                        const int stride = entry.stride /
-                            RubySystem::getBlockSizeBytes();
+                        const int stride = entry.stride / m_block_size_bytes;
 
                         // clear this filter entry
                         entry.clear();
diff --git a/src/mem/ruby/structures/RubyPrefetcher.hh b/src/mem/ruby/structures/RubyPrefetcher.hh
index 51e1b3c480..5627410713 100644
--- a/src/mem/ruby/structures/RubyPrefetcher.hh
+++ b/src/mem/ruby/structures/RubyPrefetcher.hh
@@ -68,10 +68,10 @@ class PrefetchEntry
 {
     public:
         /// constructor
-        PrefetchEntry()
+        PrefetchEntry(int block_size)
         {
             // default: 1 cache-line stride
-            m_stride   = (1 << RubySystem::getBlockSizeBits());
+            m_stride   = (1 << floorLog2(block_size));
             m_use_time = Cycles(0);
             m_is_valid = false;
         }
@@ -239,6 +239,16 @@ class RubyPrefetcher : public SimObject
 
         const unsigned pageShift;
 
+        int m_block_size_bits = 0;
+        int m_block_size_bytes = 0;
+
+        Addr
+        makeNextStrideAddress(Addr addr, int stride) const
+        {
+            return ruby::makeNextStrideAddress(addr, stride,
+                                               m_block_size_bytes);
+        }
+
         struct RubyPrefetcherStats : public statistics::Group
         {
             RubyPrefetcherStats(statistics::Group *parent);
diff --git a/src/mem/ruby/structures/RubyPrefetcher.py b/src/mem/ruby/structures/RubyPrefetcher.py
index d4189ae7d5..155b7c314d 100644
--- a/src/mem/ruby/structures/RubyPrefetcher.py
+++ b/src/mem/ruby/structures/RubyPrefetcher.py
@@ -62,6 +62,9 @@ class RubyPrefetcher(SimObject):
     page_shift = Param.UInt32(
         12, "Number of bits to mask to get a page number"
     )
+    block_size = Param.UInt32(
+        "Size of block to prefetch, usually cache line size"
+    )
 
 
 class Prefetcher(RubyPrefetcher):
diff --git a/src/mem/ruby/structures/RubyPrefetcherProxy.cc b/src/mem/ruby/structures/RubyPrefetcherProxy.cc
index 2a29fbc88e..a6fed8258c 100644
--- a/src/mem/ruby/structures/RubyPrefetcherProxy.cc
+++ b/src/mem/ruby/structures/RubyPrefetcherProxy.cc
@@ -66,7 +66,7 @@ RubyPrefetcherProxy::RubyPrefetcherProxy(AbstractController* _parent,
         prefetcher->setParentInfo(
             cacheCntrl->params().system,
             cacheCntrl->getProbeManager(),
-            RubySystem::getBlockSizeBytes());
+            cacheCntrl->m_ruby_system->getBlockSizeBytes());
     }
 }
 
@@ -112,7 +112,7 @@ RubyPrefetcherProxy::issuePrefetch()
 
         if (pkt) {
             DPRINTF(HWPrefetch, "Next prefetch ready %s\n", pkt->print());
-            unsigned blk_size = RubySystem::getBlockSizeBytes();
+            unsigned blk_size = cacheCntrl->m_ruby_system->getBlockSizeBytes();
             Addr line_addr = pkt->getBlockAddr(blk_size);
 
             if (issuedPfPkts.count(line_addr) == 0) {
@@ -126,6 +126,8 @@ RubyPrefetcherProxy::issuePrefetch()
 
                 std::shared_ptr<RubyRequest> msg =
                     std::make_shared<RubyRequest>(cacheCntrl->clockEdge(),
+                                                  blk_size,
+                                                  cacheCntrl->m_ruby_system,
                                                   pkt->getAddr(),
                                                   blk_size,
                                                   0, // pc
@@ -136,7 +138,10 @@ RubyPrefetcherProxy::issuePrefetch()
 
                 // enqueue request into prefetch queue to the cache
                 pfQueue->enqueue(msg, cacheCntrl->clockEdge(),
-                                    cacheCntrl->cyclesToTicks(Cycles(1)));
+                                 cacheCntrl->cyclesToTicks(Cycles(1)),
+                                 cacheCntrl->m_ruby_system->getRandomization(),
+                                 cacheCntrl->m_ruby_system->getWarmupEnabled()
+                                );
 
                 // track all pending PF requests
                 issuedPfPkts[line_addr] = pkt;
@@ -230,5 +235,19 @@ RubyPrefetcherProxy::regProbePoints()
             cacheCntrl->getProbeManager(), "Data Update");
 }
 
+Addr
+RubyPrefetcherProxy::makeLineAddress(Addr addr) const
+{
+    return ruby::makeLineAddress(addr,
+                           cacheCntrl->m_ruby_system->getBlockSizeBits());
+}
+
+Addr
+RubyPrefetcherProxy::getOffset(Addr addr) const
+{
+    return ruby::getOffset(addr,
+                           cacheCntrl->m_ruby_system->getBlockSizeBits());
+}
+
 } // namespace ruby
 } // namespace gem5
diff --git a/src/mem/ruby/structures/RubyPrefetcherProxy.hh b/src/mem/ruby/structures/RubyPrefetcherProxy.hh
index 34c40154b6..e7c044edf8 100644
--- a/src/mem/ruby/structures/RubyPrefetcherProxy.hh
+++ b/src/mem/ruby/structures/RubyPrefetcherProxy.hh
@@ -142,6 +142,9 @@ class RubyPrefetcherProxy : public CacheAccessor, public Named
      */
     ProbePointArg<CacheDataUpdateProbeArg> *ppDataUpdate;
 
+    Addr makeLineAddress(Addr addr) const;
+    Addr getOffset(Addr addr) const;
+
   public:
 
     /** Accessor functions */
diff --git a/src/mem/ruby/structures/TBETable.hh b/src/mem/ruby/structures/TBETable.hh
index 9030d52d9f..72770ce42f 100644
--- a/src/mem/ruby/structures/TBETable.hh
+++ b/src/mem/ruby/structures/TBETable.hh
@@ -70,6 +70,8 @@ class TBETable
         return (m_number_of_TBEs - m_map.size()) >= n;
     }
 
+    void setBlockSize(const int block_size) { m_block_size = block_size; }
+
     ENTRY *getNullEntry();
     ENTRY *lookup(Addr address);
 
@@ -85,7 +87,8 @@ class TBETable
     std::unordered_map<Addr, ENTRY> m_map;
 
   private:
-    int m_number_of_TBEs;
+    int m_number_of_TBEs = 0;
+    int m_block_size = 0;
 };
 
 template<class ENTRY>
@@ -101,7 +104,7 @@ template<class ENTRY>
 inline bool
 TBETable<ENTRY>::isPresent(Addr address) const
 {
-    assert(address == makeLineAddress(address));
+    assert(address == makeLineAddress(address, floorLog2(m_block_size)));
     assert(m_map.size() <= m_number_of_TBEs);
     return !!m_map.count(address);
 }
@@ -112,7 +115,8 @@ TBETable<ENTRY>::allocate(Addr address)
 {
     assert(!isPresent(address));
     assert(m_map.size() < m_number_of_TBEs);
-    m_map[address] = ENTRY();
+    assert(m_block_size > 0);
+    m_map.emplace(address, ENTRY(m_block_size));
 }
 
 template<class ENTRY>
diff --git a/src/mem/ruby/structures/TimerTable.cc b/src/mem/ruby/structures/TimerTable.cc
index f8f24dbfc0..a9ce92252e 100644
--- a/src/mem/ruby/structures/TimerTable.cc
+++ b/src/mem/ruby/structures/TimerTable.cc
@@ -70,7 +70,7 @@ TimerTable::nextAddress() const
 void
 TimerTable::set(Addr address, Tick ready_time)
 {
-    assert(address == makeLineAddress(address));
+    assert(address == makeLineAddress(address, m_block_size_bits));
     assert(!m_map.count(address));
 
     m_map[address] = ready_time;
@@ -87,7 +87,7 @@ TimerTable::set(Addr address, Tick ready_time)
 void
 TimerTable::unset(Addr address)
 {
-    assert(address == makeLineAddress(address));
+    assert(address == makeLineAddress(address, m_block_size_bits));
     assert(m_map.count(address));
     m_map.erase(address);
 
diff --git a/src/mem/ruby/structures/TimerTable.hh b/src/mem/ruby/structures/TimerTable.hh
index e676359fd4..92c485ab57 100644
--- a/src/mem/ruby/structures/TimerTable.hh
+++ b/src/mem/ruby/structures/TimerTable.hh
@@ -48,6 +48,12 @@ class TimerTable
   public:
     TimerTable();
 
+    void
+    setBlockSize(int block_size)
+    {
+        m_block_size_bits = floorLog2(block_size);
+    }
+
     void
     setConsumer(Consumer* consumer_ptr)
     {
@@ -88,6 +94,8 @@ class TimerTable
     //! Consumer to signal a wakeup()
     Consumer* m_consumer_ptr;
 
+    int m_block_size_bits = 0;
+
     std::string m_name;
 };
 
diff --git a/src/mem/ruby/structures/WireBuffer.cc b/src/mem/ruby/structures/WireBuffer.cc
index a839fe7cc7..3ebbe2a305 100644
--- a/src/mem/ruby/structures/WireBuffer.cc
+++ b/src/mem/ruby/structures/WireBuffer.cc
@@ -36,7 +36,6 @@
 
 #include "base/cprintf.hh"
 #include "base/stl_helpers.hh"
-#include "mem/ruby/system/RubySystem.hh"
 
 namespace gem5
 {
@@ -74,7 +73,8 @@ WireBuffer::~WireBuffer()
 }
 
 void
-WireBuffer::enqueue(MsgPtr message, Tick current_time, Tick delta)
+WireBuffer::enqueue(MsgPtr message, Tick current_time, Tick delta,
+                    bool /*ruby_is_random*/, bool /*ruby_warmup*/)
 {
     m_msg_counter++;
     Tick arrival_time = current_time + delta;
diff --git a/src/mem/ruby/structures/WireBuffer.hh b/src/mem/ruby/structures/WireBuffer.hh
index b26043b09a..75dfc154c8 100644
--- a/src/mem/ruby/structures/WireBuffer.hh
+++ b/src/mem/ruby/structures/WireBuffer.hh
@@ -78,7 +78,10 @@ class WireBuffer : public SimObject
     void setDescription(const std::string& name) { m_description = name; };
     std::string getDescription() { return m_description; };
 
-    void enqueue(MsgPtr message, Tick current_time, Tick delta);
+    // ruby_is_random and ruby_warmup are not used, but this method signature
+    // must match that of MessageBuffer.
+    void enqueue(MsgPtr message, Tick current_time, Tick delta,
+                 bool ruby_is_random = false, bool ruby_warmup = false);
     void dequeue(Tick current_time);
     const Message* peek();
     void recycle(Tick current_time, Tick recycle_latency);
diff --git a/src/mem/ruby/structures/WireBuffer.py b/src/mem/ruby/structures/WireBuffer.py
index ca67e7cb31..8cb2cfe4d6 100644
--- a/src/mem/ruby/structures/WireBuffer.py
+++ b/src/mem/ruby/structures/WireBuffer.py
@@ -35,5 +35,3 @@ class RubyWireBuffer(SimObject):
     type = "RubyWireBuffer"
     cxx_class = "gem5::ruby::WireBuffer"
     cxx_header = "mem/ruby/structures/WireBuffer.hh"
-
-    ruby_system = Param.RubySystem(Parent.any, "")
diff --git a/src/mem/ruby/system/CacheRecorder.cc b/src/mem/ruby/system/CacheRecorder.cc
index 3326856849..426c604cb0 100644
--- a/src/mem/ruby/system/CacheRecorder.cc
+++ b/src/mem/ruby/system/CacheRecorder.cc
@@ -49,31 +49,25 @@ TraceRecord::print(std::ostream& out) const
         << m_type << ", Time: " << m_time << "]";
 }
 
-CacheRecorder::CacheRecorder()
-    : m_uncompressed_trace(NULL),
-      m_uncompressed_trace_size(0),
-      m_block_size_bytes(RubySystem::getBlockSizeBytes())
-{
-}
-
 CacheRecorder::CacheRecorder(uint8_t* uncompressed_trace,
                              uint64_t uncompressed_trace_size,
                              std::vector<RubyPort*>& ruby_port_map,
-                             uint64_t block_size_bytes)
+                             uint64_t trace_block_size_bytes,
+                             uint64_t system_block_size_bytes)
     : m_uncompressed_trace(uncompressed_trace),
       m_uncompressed_trace_size(uncompressed_trace_size),
       m_ruby_port_map(ruby_port_map), m_bytes_read(0),
       m_records_read(0), m_records_flushed(0),
-      m_block_size_bytes(block_size_bytes)
+      m_block_size_bytes(trace_block_size_bytes)
 
 {
     if (m_uncompressed_trace != NULL) {
-        if (m_block_size_bytes < RubySystem::getBlockSizeBytes()) {
+        if (m_block_size_bytes < system_block_size_bytes) {
             // Block sizes larger than when the trace was recorded are not
             // supported, as we cannot reliably turn accesses to smaller blocks
             // into larger ones.
             panic("Recorded cache block size (%d) < current block size (%d) !!",
-                    m_block_size_bytes, RubySystem::getBlockSizeBytes());
+                    m_block_size_bytes, system_block_size_bytes);
         }
     }
 }
@@ -125,7 +119,7 @@ CacheRecorder::enqueueNextFetchRequest()
         DPRINTF(RubyCacheTrace, "Issuing %s\n", *traceRecord);
 
         for (int rec_bytes_read = 0; rec_bytes_read < m_block_size_bytes;
-                rec_bytes_read += RubySystem::getBlockSizeBytes()) {
+                rec_bytes_read += m_block_size_bytes) {
             RequestPtr req;
             MemCmd::Command requestType;
 
@@ -133,19 +127,19 @@ CacheRecorder::enqueueNextFetchRequest()
                 requestType = MemCmd::ReadReq;
                 req = std::make_shared<Request>(
                     traceRecord->m_data_address + rec_bytes_read,
-                    RubySystem::getBlockSizeBytes(), 0,
+                    m_block_size_bytes, 0,
                                     Request::funcRequestorId);
             }   else if (traceRecord->m_type == RubyRequestType_IFETCH) {
                 requestType = MemCmd::ReadReq;
                 req = std::make_shared<Request>(
                         traceRecord->m_data_address + rec_bytes_read,
-                        RubySystem::getBlockSizeBytes(),
+                        m_block_size_bytes,
                         Request::INST_FETCH, Request::funcRequestorId);
             }   else {
                 requestType = MemCmd::WriteReq;
                 req = std::make_shared<Request>(
                     traceRecord->m_data_address + rec_bytes_read,
-                    RubySystem::getBlockSizeBytes(), 0,
+                    m_block_size_bytes, 0,
                                 Request::funcRequestorId);
             }
 
diff --git a/src/mem/ruby/system/CacheRecorder.hh b/src/mem/ruby/system/CacheRecorder.hh
index 021da6a4da..982e8b0592 100644
--- a/src/mem/ruby/system/CacheRecorder.hh
+++ b/src/mem/ruby/system/CacheRecorder.hh
@@ -73,13 +73,15 @@ class TraceRecord
 class CacheRecorder
 {
   public:
-    CacheRecorder();
-    ~CacheRecorder();
-
+    // Construction requires block size.
+    CacheRecorder() = delete;
     CacheRecorder(uint8_t* uncompressed_trace,
                   uint64_t uncompressed_trace_size,
                   std::vector<RubyPort*>& ruby_port_map,
-                  uint64_t block_size_bytes);
+                  uint64_t trace_block_size_bytes,
+                  uint64_t system_block_size_bytes);
+    ~CacheRecorder();
+
     void addRecord(int cntrl, Addr data_addr, Addr pc_addr,
                    RubyRequestType type, Tick time, DataBlock& data);
 
diff --git a/src/mem/ruby/system/DMASequencer.cc b/src/mem/ruby/system/DMASequencer.cc
index aa3fc66814..cd9d62d12a 100644
--- a/src/mem/ruby/system/DMASequencer.cc
+++ b/src/mem/ruby/system/DMASequencer.cc
@@ -73,7 +73,7 @@ void
 DMASequencer::init()
 {
     RubyPort::init();
-    m_data_block_mask = mask(RubySystem::getBlockSizeBits());
+    m_data_block_mask = mask(m_ruby_system->getBlockSizeBits());
 }
 
 RequestStatus
@@ -110,8 +110,10 @@ DMASequencer::makeRequest(PacketPtr pkt)
 
     DPRINTF(RubyDma, "DMA req created: addr %p, len %d\n", line_addr, len);
 
+    int blk_size = m_ruby_system->getBlockSizeBytes();
+
     std::shared_ptr<SequencerMsg> msg =
-        std::make_shared<SequencerMsg>(clockEdge());
+        std::make_shared<SequencerMsg>(clockEdge(), blk_size, m_ruby_system);
     msg->getPhysicalAddress() = paddr;
     msg->getLineAddress() = line_addr;
 
@@ -145,8 +147,8 @@ DMASequencer::makeRequest(PacketPtr pkt)
 
     int offset = paddr & m_data_block_mask;
 
-    msg->getLen() = (offset + len) <= RubySystem::getBlockSizeBytes() ?
-        len : RubySystem::getBlockSizeBytes() - offset;
+    msg->getLen() = (offset + len) <= m_ruby_system->getBlockSizeBytes() ?
+        len : m_ruby_system->getBlockSizeBytes() - offset;
 
     if (write && (data != NULL)) {
         if (active_request.data != NULL) {
@@ -157,7 +159,8 @@ DMASequencer::makeRequest(PacketPtr pkt)
     m_outstanding_count++;
 
     assert(m_mandatory_q_ptr != NULL);
-    m_mandatory_q_ptr->enqueue(msg, clockEdge(), cyclesToTicks(Cycles(1)));
+    m_mandatory_q_ptr->enqueue(msg, clockEdge(), cyclesToTicks(Cycles(1)),
+        m_ruby_system->getRandomization(), m_ruby_system->getWarmupEnabled());
     active_request.bytes_issued += msg->getLen();
 
     return RequestStatus_Issued;
@@ -183,8 +186,10 @@ DMASequencer::issueNext(const Addr& address)
         return;
     }
 
+    int blk_size = m_ruby_system->getBlockSizeBytes();
+
     std::shared_ptr<SequencerMsg> msg =
-        std::make_shared<SequencerMsg>(clockEdge());
+        std::make_shared<SequencerMsg>(clockEdge(), blk_size, m_ruby_system);
     msg->getPhysicalAddress() = active_request.start_paddr +
                                 active_request.bytes_completed;
 
@@ -196,9 +201,9 @@ DMASequencer::issueNext(const Addr& address)
 
     msg->getLen() =
         (active_request.len -
-         active_request.bytes_completed < RubySystem::getBlockSizeBytes() ?
+         active_request.bytes_completed < m_ruby_system->getBlockSizeBytes() ?
          active_request.len - active_request.bytes_completed :
-         RubySystem::getBlockSizeBytes());
+         m_ruby_system->getBlockSizeBytes());
 
     if (active_request.write) {
         msg->getDataBlk().
@@ -207,7 +212,8 @@ DMASequencer::issueNext(const Addr& address)
     }
 
     assert(m_mandatory_q_ptr != NULL);
-    m_mandatory_q_ptr->enqueue(msg, clockEdge(), cyclesToTicks(Cycles(1)));
+    m_mandatory_q_ptr->enqueue(msg, clockEdge(), cyclesToTicks(Cycles(1)),
+        m_ruby_system->getRandomization(), m_ruby_system->getWarmupEnabled());
     active_request.bytes_issued += msg->getLen();
     DPRINTF(RubyDma,
             "DMA request bytes issued %d, bytes completed %d, total len %d\n",
diff --git a/src/mem/ruby/system/GPUCoalescer.cc b/src/mem/ruby/system/GPUCoalescer.cc
index 072c63efd7..4d66dc6c1b 100644
--- a/src/mem/ruby/system/GPUCoalescer.cc
+++ b/src/mem/ruby/system/GPUCoalescer.cc
@@ -142,8 +142,8 @@ UncoalescedTable::updateResources()
             // are accessed directly using the makeRequest() command
             // instead of accessing through the port. This makes
             // sending tokens through the port unnecessary
-            if (!RubySystem::getWarmupEnabled()
-                    && !RubySystem::getCooldownEnabled()) {
+            if (!coalescer->getRubySystem()->getWarmupEnabled() &&
+                !coalescer->getRubySystem()->getCooldownEnabled()) {
                 if (reqTypeMap[seq_num] != RubyRequestType_FLUSH) {
                     DPRINTF(GPUCoalescer,
                             "Returning token seqNum %d\n", seq_num);
@@ -177,7 +177,7 @@ UncoalescedTable::printRequestTable(std::stringstream& ss)
     ss << "Listing pending packets from " << instMap.size() << " instructions";
 
     for (auto& inst : instMap) {
-        ss << "\tAddr: " << printAddress(inst.first) << " with "
+        ss << "\tAddr: " << coalescer->printAddress(inst.first) << " with "
            << inst.second.size() << " pending packets" << std::endl;
     }
 }
@@ -590,7 +590,7 @@ GPUCoalescer::hitCallback(CoalescedRequest* crequest,
         // When the Ruby system is cooldown phase, the requests come from
         // the cache recorder. These requests do not get coalesced and
         // do not return valid data.
-        if (RubySystem::getCooldownEnabled())
+        if (m_ruby_system->getCooldownEnabled())
             continue;
 
         if (pkt->getPtr<uint8_t>()) {
@@ -700,8 +700,8 @@ GPUCoalescer::makeRequest(PacketPtr pkt)
         // When Ruby is in warmup or cooldown phase, the requests come from
         // the cache recorder. There is no dynamic instruction associated
         // with these requests either
-        if (!RubySystem::getWarmupEnabled()
-                && !RubySystem::getCooldownEnabled()) {
+        if (!m_ruby_system->getWarmupEnabled()
+                && !m_ruby_system->getCooldownEnabled()) {
             if (!m_usingRubyTester) {
                 num_packets = 0;
                 for (int i = 0; i < TheGpuISA::NumVecElemPerVecReg; i++) {
@@ -985,8 +985,8 @@ GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist)
         // When Ruby is in warmup or cooldown phase, the requests come
         // from the cache recorder. They do not track which port to use
         // and do not need to send the response back
-        if (!RubySystem::getWarmupEnabled()
-                && !RubySystem::getCooldownEnabled()) {
+        if (!m_ruby_system->getWarmupEnabled()
+                && !m_ruby_system->getCooldownEnabled()) {
             RubyPort::SenderState *ss =
                 safe_cast<RubyPort::SenderState *>(pkt->senderState);
             MemResponsePort *port = ss->port;
@@ -1015,9 +1015,9 @@ GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist)
     }
 
     RubySystem *rs = m_ruby_system;
-    if (RubySystem::getWarmupEnabled()) {
+    if (m_ruby_system->getWarmupEnabled()) {
         rs->m_cache_recorder->enqueueNextFetchRequest();
-    } else if (RubySystem::getCooldownEnabled()) {
+    } else if (m_ruby_system->getCooldownEnabled()) {
         rs->m_cache_recorder->enqueueNextFlushRequest();
     } else {
         testDrainComplete();
diff --git a/src/mem/ruby/system/GPUCoalescer.hh b/src/mem/ruby/system/GPUCoalescer.hh
index 42efe41cb7..08412baad1 100644
--- a/src/mem/ruby/system/GPUCoalescer.hh
+++ b/src/mem/ruby/system/GPUCoalescer.hh
@@ -341,6 +341,8 @@ class GPUCoalescer : public RubyPort
 
     void insertKernel(int wavefront_id, PacketPtr pkt);
 
+    RubySystem *getRubySystem() { return m_ruby_system; }
+
     GMTokenPort& getGMTokenPort() { return gmTokenPort; }
 
     statistics::Histogram& getOutstandReqHist() { return m_outstandReqHist; }
diff --git a/src/mem/ruby/system/RubyPort.cc b/src/mem/ruby/system/RubyPort.cc
index 2630a6a27c..127f3c7802 100644
--- a/src/mem/ruby/system/RubyPort.cc
+++ b/src/mem/ruby/system/RubyPort.cc
@@ -326,6 +326,8 @@ RubyPort::MemResponsePort::recvAtomic(PacketPtr pkt)
         panic("Ruby supports atomic accesses only in noncaching mode\n");
     }
 
+    RubySystem *rs = owner.m_ruby_system;
+
     // Check for pio requests and directly send them to the dedicated
     // pio port.
     if (pkt->cmd != MemCmd::MemSyncReq) {
@@ -343,12 +345,11 @@ RubyPort::MemResponsePort::recvAtomic(PacketPtr pkt)
             return owner.ticksToCycles(req_ticks);
         }
 
-        assert(getOffset(pkt->getAddr()) + pkt->getSize() <=
-               RubySystem::getBlockSizeBytes());
+        assert(owner.getOffset(pkt->getAddr()) + pkt->getSize() <=
+               rs->getBlockSizeBytes());
     }
 
     // Find the machine type of memory controller interface
-    RubySystem *rs = owner.m_ruby_system;
     static int mem_interface_type = -1;
     if (mem_interface_type == -1) {
         if (rs->m_abstract_controls[MachineType_Directory].size() != 0) {
@@ -404,7 +405,7 @@ RubyPort::MemResponsePort::recvFunctional(PacketPtr pkt)
     }
 
     assert(pkt->getAddr() + pkt->getSize() <=
-           makeLineAddress(pkt->getAddr()) + RubySystem::getBlockSizeBytes());
+           owner.makeLineAddress(pkt->getAddr()) + rs->getBlockSizeBytes());
 
     if (access_backing_store) {
         // The attached physmem contains the official version of data.
@@ -501,7 +502,7 @@ RubyPort::ruby_stale_translation_callback(Addr txnId)
     // assumed they will not be modified or deleted by receivers.
     // TODO: should this really be using funcRequestorId?
     auto request = std::make_shared<Request>(
-        0, RubySystem::getBlockSizeBytes(), Request::TLBI_EXT_SYNC,
+        0, m_ruby_system->getBlockSizeBytes(), Request::TLBI_EXT_SYNC,
         Request::funcRequestorId);
     // Store the txnId in extraData instead of the address
     request->setExtraData(txnId);
@@ -701,7 +702,7 @@ RubyPort::ruby_eviction_callback(Addr address)
     // assumed they will not be modified or deleted by receivers.
     // TODO: should this really be using funcRequestorId?
     auto request = std::make_shared<Request>(
-        address, RubySystem::getBlockSizeBytes(), 0,
+        address, m_ruby_system->getBlockSizeBytes(), 0,
         Request::funcRequestorId);
 
     // Use a single packet to signal all snooping ports of the invalidation.
@@ -739,5 +740,23 @@ RubyPort::functionalWrite(Packet *func_pkt)
     return num_written;
 }
 
+Addr
+RubyPort::getOffset(Addr addr) const
+{
+    return ruby::getOffset(addr, m_ruby_system->getBlockSizeBits());
+}
+
+Addr
+RubyPort::makeLineAddress(Addr addr) const
+{
+    return ruby::makeLineAddress(addr, m_ruby_system->getBlockSizeBits());
+}
+
+std::string
+RubyPort::printAddress(Addr addr) const
+{
+    return ruby::printAddress(addr, m_ruby_system->getBlockSizeBits());
+}
+
 } // namespace ruby
 } // namespace gem5
diff --git a/src/mem/ruby/system/RubyPort.hh b/src/mem/ruby/system/RubyPort.hh
index 66fe0a7686..39535930b3 100644
--- a/src/mem/ruby/system/RubyPort.hh
+++ b/src/mem/ruby/system/RubyPort.hh
@@ -181,6 +181,11 @@ class RubyPort : public ClockedObject
 
     virtual int functionalWrite(Packet *func_pkt);
 
+    // Helper methods wrapping commonly used functions from common/address.hh
+    Addr getOffset(Addr addr) const;
+    Addr makeLineAddress(Addr addr) const;
+    std::string printAddress(Addr addr) const;
+
   protected:
     void trySendRetries();
     void ruby_hit_callback(PacketPtr pkt);
diff --git a/src/mem/ruby/system/RubySystem.cc b/src/mem/ruby/system/RubySystem.cc
index 21062eac14..fd7b262cb1 100644
--- a/src/mem/ruby/system/RubySystem.cc
+++ b/src/mem/ruby/system/RubySystem.cc
@@ -66,15 +66,8 @@ namespace gem5
 namespace ruby
 {
 
-bool RubySystem::m_randomization;
-uint32_t RubySystem::m_block_size_bytes;
-uint32_t RubySystem::m_block_size_bits;
-uint32_t RubySystem::m_memory_size_bits;
-bool RubySystem::m_warmup_enabled = false;
 // To look forward to allowing multiple RubySystem instances, track the number
 // of RubySystems that need to be warmed up on checkpoint restore.
-unsigned RubySystem::m_systems_to_warmup = 0;
-bool RubySystem::m_cooldown_enabled = false;
 
 RubySystem::RubySystem(const Params &p)
     : ClockedObject(p), m_access_backing_store(p.access_backing_store),
@@ -212,8 +205,8 @@ RubySystem::makeCacheRecorder(uint8_t *uncompressed_trace,
 
     // Create the CacheRecorder and record the cache trace
     m_cache_recorder = new CacheRecorder(uncompressed_trace, cache_trace_size,
-                                         ruby_port_map,
-                                         block_size_bytes);
+                                         ruby_port_map, block_size_bytes,
+                                         m_block_size_bytes);
 }
 
 void
@@ -331,7 +324,7 @@ RubySystem::serialize(CheckpointOut &cp) const
     // Store the cache-block size, so we are able to restore on systems
     // with a different cache-block size. CacheRecorder depends on the
     // correct cache-block size upon unserializing.
-    uint64_t block_size_bytes = getBlockSizeBytes();
+    uint64_t block_size_bytes = m_block_size_bytes;
     SERIALIZE_SCALAR(block_size_bytes);
 
     // Check that there's a valid trace to use.  If not, then memory won't
@@ -416,7 +409,6 @@ RubySystem::unserialize(CheckpointIn &cp)
     readCompressedTrace(cache_trace_file, uncompressed_trace,
                         cache_trace_size);
     m_warmup_enabled = true;
-    m_systems_to_warmup++;
 
     // Create the cache recorder that will hang around until startup.
     makeCacheRecorder(uncompressed_trace, cache_trace_size, block_size_bytes);
@@ -467,10 +459,7 @@ RubySystem::startup()
 
         delete m_cache_recorder;
         m_cache_recorder = NULL;
-        m_systems_to_warmup--;
-        if (m_systems_to_warmup == 0) {
-            m_warmup_enabled = false;
-        }
+        m_warmup_enabled = false;
 
         // Restore eventq head
         eventq->replaceHead(eventq_head);
@@ -509,7 +498,7 @@ bool
 RubySystem::functionalRead(PacketPtr pkt)
 {
     Addr address(pkt->getAddr());
-    Addr line_address = makeLineAddress(address);
+    Addr line_address = makeLineAddress(address, m_block_size_bits);
 
     AccessPermission access_perm = AccessPermission_NotPresent;
 
@@ -625,7 +614,7 @@ bool
 RubySystem::functionalRead(PacketPtr pkt)
 {
     Addr address(pkt->getAddr());
-    Addr line_address = makeLineAddress(address);
+    Addr line_address = makeLineAddress(address, m_block_size_bits);
 
     DPRINTF(RubySystem, "Functional Read request for %#x\n", address);
 
@@ -726,7 +715,7 @@ bool
 RubySystem::functionalWrite(PacketPtr pkt)
 {
     Addr addr(pkt->getAddr());
-    Addr line_addr = makeLineAddress(addr);
+    Addr line_addr = makeLineAddress(addr, m_block_size_bits);
     AccessPermission access_perm = AccessPermission_NotPresent;
 
     DPRINTF(RubySystem, "Functional Write request for %#x\n", addr);
diff --git a/src/mem/ruby/system/RubySystem.hh b/src/mem/ruby/system/RubySystem.hh
index e16d699204..7e18770230 100644
--- a/src/mem/ruby/system/RubySystem.hh
+++ b/src/mem/ruby/system/RubySystem.hh
@@ -68,12 +68,12 @@ class RubySystem : public ClockedObject
     ~RubySystem();
 
     // config accessors
-    static int getRandomization() { return m_randomization; }
-    static uint32_t getBlockSizeBytes() { return m_block_size_bytes; }
-    static uint32_t getBlockSizeBits() { return m_block_size_bits; }
-    static uint32_t getMemorySizeBits() { return m_memory_size_bits; }
-    static bool getWarmupEnabled() { return m_warmup_enabled; }
-    static bool getCooldownEnabled() { return m_cooldown_enabled; }
+    int getRandomization() { return m_randomization; }
+    uint32_t getBlockSizeBytes() { return m_block_size_bytes; }
+    uint32_t getBlockSizeBits() { return m_block_size_bits; }
+    uint32_t getMemorySizeBits() { return m_memory_size_bits; }
+    bool getWarmupEnabled() { return m_warmup_enabled; }
+    bool getCooldownEnabled() { return m_cooldown_enabled; }
 
     memory::SimpleMemory *getPhysMem() { return m_phys_mem; }
     Cycles getStartCycle() { return m_start_cycle; }
@@ -134,14 +134,13 @@ class RubySystem : public ClockedObject
     void processRubyEvent();
   private:
     // configuration parameters
-    static bool m_randomization;
-    static uint32_t m_block_size_bytes;
-    static uint32_t m_block_size_bits;
-    static uint32_t m_memory_size_bits;
+    bool m_randomization;
+    uint32_t m_block_size_bytes;
+    uint32_t m_block_size_bits;
+    uint32_t m_memory_size_bits;
 
-    static bool m_warmup_enabled;
-    static unsigned m_systems_to_warmup;
-    static bool m_cooldown_enabled;
+    bool m_warmup_enabled = false;
+    bool m_cooldown_enabled = false;
     memory::SimpleMemory *m_phys_mem;
     const bool m_access_backing_store;
 
@@ -158,6 +157,11 @@ class RubySystem : public ClockedObject
     Profiler* m_profiler;
     CacheRecorder* m_cache_recorder;
     std::vector<std::map<uint32_t, AbstractController *> > m_abstract_controls;
+    std::map<MachineType, uint32_t> m_num_controllers;
+
+    // These are auto-generated by SLICC based on the built protocol.
+    int MachineType_base_count(const MachineType& obj);
+    int MachineType_base_number(const MachineType& obj);
 };
 
 } // namespace ruby
diff --git a/src/mem/ruby/system/Sequencer.cc b/src/mem/ruby/system/Sequencer.cc
index 4b0c6a239c..e2f49f5dff 100644
--- a/src/mem/ruby/system/Sequencer.cc
+++ b/src/mem/ruby/system/Sequencer.cc
@@ -73,6 +73,8 @@ Sequencer::Sequencer(const Params &p)
 {
     m_outstanding_count = 0;
 
+    m_ruby_system = p.ruby_system;
+
     m_dataCache_ptr = p.dcache;
     m_max_outstanding_requests = p.max_outstanding_requests;
     m_deadlock_threshold = p.deadlock_threshold;
@@ -726,7 +728,7 @@ Sequencer::hitCallback(SequencerRequest* srequest, DataBlock& data,
                          printAddress(request_address));
 
     // update the data unless it is a non-data-carrying flush
-    if (RubySystem::getWarmupEnabled()) {
+    if (m_ruby_system->getWarmupEnabled()) {
         data.setData(pkt);
     } else if (!pkt->isFlush()) {
         if ((type == RubyRequestType_LD) ||
@@ -782,11 +784,11 @@ Sequencer::hitCallback(SequencerRequest* srequest, DataBlock& data,
     }
 
     RubySystem *rs = m_ruby_system;
-    if (RubySystem::getWarmupEnabled()) {
+    if (m_ruby_system->getWarmupEnabled()) {
         assert(pkt->req);
         delete pkt;
         rs->m_cache_recorder->enqueueNextFetchRequest();
-    } else if (RubySystem::getCooldownEnabled()) {
+    } else if (m_ruby_system->getCooldownEnabled()) {
         delete pkt;
         rs->m_cache_recorder->enqueueNextFlushRequest();
     } else {
@@ -852,8 +854,8 @@ Sequencer::completeHitCallback(std::vector<PacketPtr> & mylist)
         // When Ruby is in warmup or cooldown phase, the requests come
         // from the cache recorder. They do not track which port to use
         // and do not need to send the response back
-        if (!RubySystem::getWarmupEnabled()
-                && !RubySystem::getCooldownEnabled()) {
+        if (!m_ruby_system->getWarmupEnabled()
+                && !m_ruby_system->getCooldownEnabled()) {
             RubyPort::SenderState *ss =
                 safe_cast<RubyPort::SenderState *>(pkt->senderState);
             MemResponsePort *port = ss->port;
@@ -873,9 +875,9 @@ Sequencer::completeHitCallback(std::vector<PacketPtr> & mylist)
     }
 
     RubySystem *rs = m_ruby_system;
-    if (RubySystem::getWarmupEnabled()) {
+    if (m_ruby_system->getWarmupEnabled()) {
         rs->m_cache_recorder->enqueueNextFetchRequest();
-    } else if (RubySystem::getCooldownEnabled()) {
+    } else if (m_ruby_system->getCooldownEnabled()) {
         rs->m_cache_recorder->enqueueNextFlushRequest();
     } else {
         testDrainComplete();
@@ -910,14 +912,16 @@ Sequencer::invL1()
         // Evict Read-only data
         RubyRequestType request_type = RubyRequestType_REPLACEMENT;
         std::shared_ptr<RubyRequest> msg = std::make_shared<RubyRequest>(
-            clockEdge(), addr, 0, 0,
-            request_type, RubyAccessMode_Supervisor,
+            clockEdge(), m_ruby_system->getBlockSizeBytes(), m_ruby_system,
+            addr, 0, 0, request_type, RubyAccessMode_Supervisor,
             nullptr);
         DPRINTF(RubySequencer, "Evicting addr 0x%x\n", addr);
         assert(m_mandatory_q_ptr != NULL);
         Tick latency = cyclesToTicks(
             m_controller->mandatoryQueueLatency(request_type));
-        m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency);
+        m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency,
+                                   m_ruby_system->getRandomization(),
+                                   m_ruby_system->getWarmupEnabled());
         m_num_pending_invs++;
     }
     DPRINTF(RubySequencer,
@@ -1080,11 +1084,14 @@ Sequencer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type)
         pc = pkt->req->getPC();
     }
 
+    int blk_size = m_ruby_system->getBlockSizeBytes();
+
     // check if the packet has data as for example prefetch and flush
     // requests do not
     std::shared_ptr<RubyRequest> msg;
     if (pkt->req->isMemMgmt()) {
-        msg = std::make_shared<RubyRequest>(clockEdge(),
+        msg = std::make_shared<RubyRequest>(clockEdge(), blk_size,
+                                            m_ruby_system,
                                             pc, secondary_type,
                                             RubyAccessMode_Supervisor, pkt,
                                             proc_id, core_id);
@@ -1111,8 +1118,10 @@ Sequencer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type)
                     msg->m_tlbiTransactionUid);
         }
     } else {
-        msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
-                                            pkt->getSize(), pc, secondary_type,
+        msg = std::make_shared<RubyRequest>(clockEdge(), blk_size,
+                                            m_ruby_system,
+                                            pkt->getAddr(), pkt->getSize(),
+                                            pc, secondary_type,
                                             RubyAccessMode_Supervisor, pkt,
                                             PrefetchBit_No, proc_id, core_id);
 
@@ -1147,7 +1156,9 @@ Sequencer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type)
     assert(latency > 0);
 
     assert(m_mandatory_q_ptr != NULL);
-    m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency);
+    m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency,
+                               m_ruby_system->getRandomization(),
+                               m_ruby_system->getWarmupEnabled());
 }
 
 template <class KEY, class VALUE>
@@ -1194,7 +1205,7 @@ Sequencer::incrementUnaddressedTransactionCnt()
     // Limit m_unaddressedTransactionCnt to 32 bits,
     // top 32 bits should always be zeroed out
     uint64_t aligned_txid = \
-        m_unaddressedTransactionCnt << RubySystem::getBlockSizeBits();
+        m_unaddressedTransactionCnt << m_ruby_system->getBlockSizeBits();
 
     if (aligned_txid > 0xFFFFFFFFull) {
         m_unaddressedTransactionCnt = 0;
@@ -1206,7 +1217,7 @@ Sequencer::getCurrentUnaddressedTransactionID() const
 {
     return (
         uint64_t(m_version & 0xFFFFFFFF) << 32) |
-        (m_unaddressedTransactionCnt << RubySystem::getBlockSizeBits()
+        (m_unaddressedTransactionCnt << m_ruby_system->getBlockSizeBits()
     );
 }
 
diff --git a/src/mem/ruby/system/Sequencer.hh b/src/mem/ruby/system/Sequencer.hh
index 1f60d2638f..ee16d2fe2e 100644
--- a/src/mem/ruby/system/Sequencer.hh
+++ b/src/mem/ruby/system/Sequencer.hh
@@ -254,6 +254,8 @@ class Sequencer : public RubyPort
                                         RubyRequestType primary_type,
                                         RubyRequestType secondary_type);
 
+    RubySystem *m_ruby_system;
+
   private:
     int m_max_outstanding_requests;
 
diff --git a/src/mem/ruby/system/Sequencer.py b/src/mem/ruby/system/Sequencer.py
index 3f570fb952..0994bb4afe 100644
--- a/src/mem/ruby/system/Sequencer.py
+++ b/src/mem/ruby/system/Sequencer.py
@@ -83,7 +83,7 @@ class RubyPort(ClockedObject):
 
     using_ruby_tester = Param.Bool(False, "")
     no_retry_on_stall = Param.Bool(False, "")
-    ruby_system = Param.RubySystem(Parent.any, "")
+    ruby_system = Param.RubySystem("Parent RubySystem object")
     system = Param.System(Parent.any, "system object")
     support_data_reqs = Param.Bool(True, "data cache requests supported")
     support_inst_reqs = Param.Bool(True, "inst cache requests supported")
diff --git a/src/mem/ruby/system/VIPERCoalescer.cc b/src/mem/ruby/system/VIPERCoalescer.cc
index 47ceced3a7..67dd88fb2e 100644
--- a/src/mem/ruby/system/VIPERCoalescer.cc
+++ b/src/mem/ruby/system/VIPERCoalescer.cc
@@ -135,9 +135,9 @@ VIPERCoalescer::issueRequest(CoalescedRequest* crequest)
     // Creating WriteMask that records written bytes
     // and atomic operations. This enables partial writes
     // and partial reads of those writes
-    DataBlock dataBlock;
+    uint32_t blockSize = m_ruby_system->getBlockSizeBytes();
+    DataBlock dataBlock(blockSize);
     dataBlock.clear();
-    uint32_t blockSize = RubySystem::getBlockSizeBytes();
     std::vector<bool> accessMask(blockSize,false);
     std::vector< std::pair<int,AtomicOpFunctor*> > atomicOps;
     uint32_t tableSize = crequest->getPackets().size();
@@ -159,15 +159,17 @@ VIPERCoalescer::issueRequest(CoalescedRequest* crequest)
     }
     std::shared_ptr<RubyRequest> msg;
     if (pkt->isAtomicOp()) {
-        msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
-                              pkt->getSize(), pc, crequest->getRubyType(),
+        msg = std::make_shared<RubyRequest>(clockEdge(), blockSize,
+                              m_ruby_system, pkt->getAddr(), pkt->getSize(),
+                              pc, crequest->getRubyType(),
                               RubyAccessMode_Supervisor, pkt,
                               PrefetchBit_No, proc_id, 100,
                               blockSize, accessMask,
                               dataBlock, atomicOps, crequest->getSeqNum());
     } else {
-        msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
-                              pkt->getSize(), pc, crequest->getRubyType(),
+        msg = std::make_shared<RubyRequest>(clockEdge(), blockSize,
+                              m_ruby_system, pkt->getAddr(), pkt->getSize(),
+                              pc, crequest->getRubyType(),
                               RubyAccessMode_Supervisor, pkt,
                               PrefetchBit_No, proc_id, 100,
                               blockSize, accessMask,
@@ -195,7 +197,9 @@ VIPERCoalescer::issueRequest(CoalescedRequest* crequest)
     assert(m_mandatory_q_ptr);
     Tick latency = cyclesToTicks(
         m_controller->mandatoryQueueLatency(crequest->getRubyType()));
-    m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency);
+    m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency,
+                               m_ruby_system->getRandomization(),
+                               m_ruby_system->getWarmupEnabled());
 }
 
 void
@@ -241,7 +245,7 @@ VIPERCoalescer::writeCompleteCallback(Addr addr, uint64_t instSeqNum)
         std::remove_if(
             m_writeCompletePktMap[key].begin(),
             m_writeCompletePktMap[key].end(),
-            [addr](PacketPtr writeCompletePkt) -> bool {
+            [this,addr](PacketPtr writeCompletePkt) -> bool {
                 if (makeLineAddress(writeCompletePkt->getAddr()) == addr) {
                     RubyPort::SenderState *ss =
                         safe_cast<RubyPort::SenderState *>
@@ -296,14 +300,15 @@ VIPERCoalescer::invTCP()
         // Evict Read-only data
         RubyRequestType request_type = RubyRequestType_REPLACEMENT;
         std::shared_ptr<RubyRequest> msg = std::make_shared<RubyRequest>(
-            clockEdge(), addr, 0, 0,
-            request_type, RubyAccessMode_Supervisor,
-            nullptr);
+            clockEdge(), m_ruby_system->getBlockSizeBytes(), m_ruby_system,
+            addr, 0, 0, request_type, RubyAccessMode_Supervisor, nullptr);
         DPRINTF(GPUCoalescer, "Evicting addr 0x%x\n", addr);
         assert(m_mandatory_q_ptr != NULL);
         Tick latency = cyclesToTicks(
             m_controller->mandatoryQueueLatency(request_type));
-        m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency);
+        m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency,
+                                   m_ruby_system->getRandomization(),
+                                   m_ruby_system->getWarmupEnabled());
         m_num_pending_invs++;
     }
     DPRINTF(GPUCoalescer,
@@ -343,16 +348,17 @@ VIPERCoalescer::invTCC(PacketPtr pkt)
     RubyRequestType request_type = RubyRequestType_InvL2;
 
     std::shared_ptr<RubyRequest> msg = std::make_shared<RubyRequest>(
-        clockEdge(), addr, 0, 0,
-        request_type, RubyAccessMode_Supervisor,
-        nullptr);
+        clockEdge(), m_ruby_system->getBlockSizeBytes(), m_ruby_system,
+        addr, 0, 0, request_type, RubyAccessMode_Supervisor, nullptr);
 
     DPRINTF(GPUCoalescer, "Sending L2 invalidate to 0x%x\n", addr);
 
     assert(m_mandatory_q_ptr);
     Tick latency = cyclesToTicks(
         m_controller->mandatoryQueueLatency(request_type));
-    m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency);
+    m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency,
+                               m_ruby_system->getRandomization(),
+                               m_ruby_system->getWarmupEnabled());
 
     m_pending_invl2s[addr].push_back(pkt);
 }
diff --git a/src/mem/ruby/system/VIPERSequencer.cc b/src/mem/ruby/system/VIPERSequencer.cc
index ac840777d4..b8b806aa9c 100644
--- a/src/mem/ruby/system/VIPERSequencer.cc
+++ b/src/mem/ruby/system/VIPERSequencer.cc
@@ -81,8 +81,8 @@ VIPERSequencer::hitCallback(SequencerRequest* srequest, DataBlock& data,
     // subBlock with the recieved data.  The tester will later access
     // this state.
     assert(!m_usingRubyTester);
-    assert(!RubySystem::getWarmupEnabled());
-    assert(!RubySystem::getCooldownEnabled());
+    assert(!m_ruby_system->getWarmupEnabled());
+    assert(!m_ruby_system->getCooldownEnabled());
     ruby_hit_callback(pkt);
     testDrainComplete();
 }
diff --git a/src/mem/slicc/ast/CheckProbeStatementAST.py b/src/mem/slicc/ast/CheckProbeStatementAST.py
index 10945cfc30..14f6f7e4fa 100644
--- a/src/mem/slicc/ast/CheckProbeStatementAST.py
+++ b/src/mem/slicc/ast/CheckProbeStatementAST.py
@@ -49,7 +49,8 @@ class CheckProbeStatementAST(StatementAST):
     if (m_is_blocking &&
         (m_block_map.count($address_code) == 1) &&
         (m_block_map[$address_code] == &$in_port_code)) {
-            $in_port_code.delayHead(clockEdge(), cyclesToTicks(Cycles(1)));
+            $in_port_code.delayHead(clockEdge(), cyclesToTicks(Cycles(1)),
+                m_ruby_system->getRandomization(), m_ruby_system->getWarmupEnabled());
             continue;
         }
         """
diff --git a/src/mem/slicc/ast/DeferEnqueueingStatementAST.py b/src/mem/slicc/ast/DeferEnqueueingStatementAST.py
index 14b2e48cd3..4bb446aee2 100644
--- a/src/mem/slicc/ast/DeferEnqueueingStatementAST.py
+++ b/src/mem/slicc/ast/DeferEnqueueingStatementAST.py
@@ -68,7 +68,8 @@ class DeferEnqueueingStatementAST(StatementAST):
         # Declare message
         code(
             "std::shared_ptr<${{msg_type.c_ident}}> out_msg = "
-            "std::make_shared<${{msg_type.c_ident}}>(clockEdge());"
+            "std::make_shared<${{msg_type.c_ident}}>(clockEdge(),"
+            "    m_ruby_system->getBlockSizeBytes(), m_ruby_system);"
         )
 
         # The other statements
diff --git a/src/mem/slicc/ast/EnqueueStatementAST.py b/src/mem/slicc/ast/EnqueueStatementAST.py
index c2d47af9ce..b026f6e7a9 100644
--- a/src/mem/slicc/ast/EnqueueStatementAST.py
+++ b/src/mem/slicc/ast/EnqueueStatementAST.py
@@ -76,7 +76,8 @@ class EnqueueStatementAST(StatementAST):
         # Declare message
         code(
             "std::shared_ptr<${{msg_type.c_ident}}> out_msg = "
-            "std::make_shared<${{msg_type.c_ident}}>(clockEdge());"
+            "std::make_shared<${{msg_type.c_ident}}>(clockEdge(), "
+            "    m_ruby_system->getBlockSizeBytes(), m_ruby_system);"
         )
 
         # The other statements
@@ -89,17 +90,21 @@ class EnqueueStatementAST(StatementAST):
                 bypass_strict_fifo_code = self.bypass_strict_fifo.inline(False)
                 code(
                     "(${{self.queue_name.var.code}}).enqueue("
-                    "out_msg, clockEdge(), cyclesToTicks(Cycles($rcode)), $bypass_strict_fifo_code);"
+                    "out_msg, clockEdge(), cyclesToTicks(Cycles($rcode)), "
+                    "m_ruby_system->getRandomization(), m_ruby_system->getWarmupEnabled(), "
+                    "$bypass_strict_fifo_code);"
                 )
             else:
                 code(
                     "(${{self.queue_name.var.code}}).enqueue("
-                    "out_msg, clockEdge(), cyclesToTicks(Cycles($rcode)));"
+                    "out_msg, clockEdge(), cyclesToTicks(Cycles($rcode)), "
+                    "m_ruby_system->getRandomization(), m_ruby_system->getWarmupEnabled());"
                 )
         else:
             code(
                 "(${{self.queue_name.var.code}}).enqueue(out_msg, "
-                "clockEdge(), cyclesToTicks(Cycles(1)));"
+                "clockEdge(), cyclesToTicks(Cycles(1)),"
+                "m_ruby_system->getRandomization(), m_ruby_system->getWarmupEnabled());"
             )
 
         # End scope
diff --git a/src/mem/slicc/ast/LocalVariableAST.py b/src/mem/slicc/ast/LocalVariableAST.py
index b4ac8f446b..43ab110a67 100644
--- a/src/mem/slicc/ast/LocalVariableAST.py
+++ b/src/mem/slicc/ast/LocalVariableAST.py
@@ -73,6 +73,8 @@ class LocalVariableAST(StatementAST):
             )
         ):
             code += f"{type.c_ident}* {ident}"
+        elif "implicit_ctor" in type:
+            code += f"{type.c_ident} {ident}({type['implicit_ctor']})"
         else:
             code += f"{type.c_ident} {ident}"
         return type
diff --git a/src/mem/slicc/ast/PeekStatementAST.py b/src/mem/slicc/ast/PeekStatementAST.py
index 00edff4e7b..415f4ec465 100644
--- a/src/mem/slicc/ast/PeekStatementAST.py
+++ b/src/mem/slicc/ast/PeekStatementAST.py
@@ -93,7 +93,8 @@ class PeekStatementAST(StatementAST):
     if (m_is_blocking &&
         (m_block_map.count(in_msg_ptr->m_$address_field) == 1) &&
         (m_block_map[in_msg_ptr->m_$address_field] != &$qcode)) {
-            $qcode.delayHead(clockEdge(), cyclesToTicks(Cycles(1)));
+            $qcode.delayHead(clockEdge(), cyclesToTicks(Cycles(1)),
+            m_ruby_system->getRandomization(), m_ruby_system->getWarmupEnabled());
             continue;
     }
             """
diff --git a/src/mem/slicc/symbols/StateMachine.py b/src/mem/slicc/symbols/StateMachine.py
index b523522501..6202d2d239 100644
--- a/src/mem/slicc/symbols/StateMachine.py
+++ b/src/mem/slicc/symbols/StateMachine.py
@@ -352,7 +352,6 @@ class $c_ident : public AbstractController
   public:
     typedef ${c_ident}Params Params;
     $c_ident(const Params &p);
-    static int getNumControllers();
     void init();
 
     MessageBuffer *getMandatoryQueue() const;
@@ -449,9 +448,8 @@ int m_counters[${ident}_State_NUM][${ident}_Event_NUM];
 int m_event_counters[${ident}_Event_NUM];
 bool m_possible[${ident}_State_NUM][${ident}_Event_NUM];
 
-static std::vector<statistics::Vector *> eventVec;
-static std::vector<std::vector<statistics::Vector *> > transVec;
-static int m_num_controllers;
+std::vector<statistics::Vector *> eventVec;
+std::vector<std::vector<statistics::Vector *> > transVec;
 
 // Internal functions
 """
@@ -625,10 +623,6 @@ namespace gem5
 namespace ruby
 {
 
-int $c_ident::m_num_controllers = 0;
-std::vector<statistics::Vector *>  $c_ident::eventVec;
-std::vector<std::vector<statistics::Vector *> >  $c_ident::transVec;
-
 // for adding information to the protocol debug trace
 std::stringstream ${ident}_transitionComment;
 
@@ -644,8 +638,9 @@ $c_ident::$c_ident(const Params &p)
 {
     m_machineID.type = MachineType_${ident};
     m_machineID.num = m_version;
-    m_num_controllers++;
+    p.ruby_system->m_num_controllers[MachineType_${ident}]++;
     p.ruby_system->registerAbstractController(this);
+    m_ruby_system = p.ruby_system;
 
     m_in_ports = $num_in_ports;
 """
@@ -699,7 +694,7 @@ void
 $c_ident::initNetQueues()
 {
     MachineType machine_type = string_to_MachineType("${{self.ident}}");
-    [[maybe_unused]] int base = MachineType_base_number(machine_type);
+    [[maybe_unused]] int base = m_ruby_system->MachineType_base_number(machine_type);
 
 """
         )
@@ -776,6 +771,17 @@ $c_ident::init()
                         comment = f"Type {vtype.ident} default"
                         code('*$vid = ${{vtype["default"]}}; // $comment')
 
+                    # Objects that must know the cache line size (currently
+                    # only TBETable, matched exactly) get it set here.
+                    if vtype.c_ident == "TBETable":
+                        block_size_func = "m_ruby_system->getBlockSizeBytes()"
+                        code(f"(*{vid}).setBlockSize({block_size_func});")
+
+        for param in self.config_parameters:
+            if param.type_ast.type.ident == "CacheMemory":
+                assert param.pointer
+                code(f"m_{param.ident}_ptr->setRubySystem(m_ruby_system);")
+
         # Set the prefetchers
         code()
         for prefetcher in self.prefetchers:
@@ -942,7 +948,9 @@ $c_ident::regStats()
                 "${c_ident}." + ${ident}_Event_to_string(event);
             statistics::Vector *t =
                 new statistics::Vector(profilerStatsPtr, stat_name.c_str());
-            t->init(m_num_controllers);
+            int num_controllers =
+                m_ruby_system->m_num_controllers[MachineType_${ident}];
+            t->init(num_controllers);
             t->flags(statistics::pdf | statistics::total |
                 statistics::oneline | statistics::nozero);
 
@@ -961,7 +969,9 @@ $c_ident::regStats()
                     "." + ${ident}_Event_to_string(event);
                 statistics::Vector *t = new statistics::Vector(
                     profilerStatsPtr, stat_name.c_str());
-                t->init(m_num_controllers);
+                int num_controllers =
+                    m_ruby_system->m_num_controllers[MachineType_${ident}];
+                t->init(num_controllers);
                 t->flags(statistics::pdf | statistics::total |
                     statistics::oneline | statistics::nozero);
                 transVec[state].push_back(t);
@@ -1062,9 +1072,12 @@ $c_ident::regStats()
 void
 $c_ident::collateStats()
 {
+    int num_controllers =
+        m_ruby_system->m_num_controllers[MachineType_${ident}];
+
     for (${ident}_Event event = ${ident}_Event_FIRST;
          event < ${ident}_Event_NUM; ++event) {
-        for (unsigned int i = 0; i < m_num_controllers; ++i) {
+        for (unsigned int i = 0; i < num_controllers; ++i) {
             RubySystem *rs = params().ruby_system;
             std::map<uint32_t, AbstractController *>::iterator it =
                      rs->m_abstract_controls[MachineType_${ident}].find(i);
@@ -1080,7 +1093,7 @@ $c_ident::collateStats()
         for (${ident}_Event event = ${ident}_Event_FIRST;
              event < ${ident}_Event_NUM; ++event) {
 
-            for (unsigned int i = 0; i < m_num_controllers; ++i) {
+            for (unsigned int i = 0; i < num_controllers; ++i) {
                 RubySystem *rs = params().ruby_system;
                 std::map<uint32_t, AbstractController *>::iterator it =
                          rs->m_abstract_controls[MachineType_${ident}].find(i);
@@ -1125,12 +1138,6 @@ $c_ident::getTransitionCount(${ident}_State state,
     return m_counters[state][event];
 }
 
-int
-$c_ident::getNumControllers()
-{
-    return m_num_controllers;
-}
-
 MessageBuffer*
 $c_ident::getMandatoryQueue() const
 {
@@ -1181,6 +1188,7 @@ void
 $c_ident::set_cache_entry(${{self.EntryType.c_ident}}*& m_cache_entry_ptr, AbstractCacheEntry* m_new_cache_entry)
 {
   m_cache_entry_ptr = (${{self.EntryType.c_ident}}*)m_new_cache_entry;
+  m_cache_entry_ptr->setRubySystem(m_ruby_system);
 }
 
 void
@@ -1200,6 +1208,7 @@ void
 $c_ident::set_tbe(${{self.TBEType.c_ident}}*& m_tbe_ptr, ${{self.TBEType.c_ident}}* m_new_tbe)
 {
   m_tbe_ptr = m_new_tbe;
+  m_tbe_ptr->setRubySystem(m_ruby_system);
 }
 
 void
diff --git a/src/mem/slicc/symbols/Type.py b/src/mem/slicc/symbols/Type.py
index 535a4165b3..53c8ff877e 100644
--- a/src/mem/slicc/symbols/Type.py
+++ b/src/mem/slicc/symbols/Type.py
@@ -119,6 +119,10 @@ class Type(Symbol):
     def isMessage(self):
         return "message" in self
 
+    @property
+    def isTBE(self):
+        return "tbe" in self
+
     @property
     def isBuffer(self):
         return "buffer" in self
@@ -250,18 +254,54 @@ namespace gem5
 namespace ruby
 {
 
+class RubySystem;
+
 $klass ${{self.c_ident}}$parent
 {
   public:
-    ${{self.c_ident}}
 """,
             klass="class",
         )
 
         if self.isMessage:
-            code("(Tick curTime) : %s(curTime) {" % self["interface"])
+            code(
+                "${{self.c_ident}}(Tick curTime, int blockSize, RubySystem* rs) : %s(curTime, blockSize, rs)"
+                % self["interface"]
+            )
+
+            for dm in self.data_members.values():
+                if dm.real_c_type in ("DataBlock", "WriteMask"):
+                    code(f"\t\t, m_{dm.ident}(blockSize)")
+
+            code("{")
+        elif self.isTBE:
+            code("${{self.c_ident}}(int block_size)")
+
+            ctor_count = 0
+            for dm in self.data_members.values():
+                if dm.real_c_type in ("DataBlock", "WriteMask"):
+                    if ctor_count == 0:
+                        code("\t:")
+                    else:
+                        code("\t, ")
+                    code(f"\t\tm_{dm.ident}(block_size)")
+                    ctor_count += 1
+
+            code("{")
         else:
-            code("()\n\t\t{")
+            code("${{self.c_ident}}()")
+
+            ctor_count = 0
+            for dm in self.data_members.values():
+                if dm.real_c_type in ("DataBlock", "WriteMask"):
+                    if ctor_count == 0:
+                        code("\t:")
+                    else:
+                        code("\t, ")
+                    code(f"\t\tm_{dm.ident}(0)")
+                    ctor_count += 1
+
+            code("{")
 
         code.indent()
         if not self.isGlobal:
@@ -280,6 +320,12 @@ $klass ${{self.c_ident}}$parent
                     code(" // default value of $tid")
                 else:
                     code("// m_$ident has no default")
+
+                # These parts of Messages need RubySystem pointers. For things
+                # like Entry which only store NetDest, RubySystem is not needed.
+                if self.isMessage and dm.real_c_type == "NetDest":
+                    code("// m_$ident requires RubySystem")
+                    code("m_$ident.setRubySystem(rs);")
             code.dedent()
         code("}")
 
@@ -300,21 +346,45 @@ $klass ${{self.c_ident}}$parent
             params = ", ".join(params)
 
             if self.isMessage:
-                params = "const Tick curTime, " + params
+                params = (
+                    "const Tick curTime, const int blockSize, const RubySystem *rs, "
+                    + params
+                )
 
             code("${{self.c_ident}}($params)")
 
             # Call superclass constructor
             if "interface" in self:
                 if self.isMessage:
-                    code('    : ${{self["interface"]}}(curTime)')
+                    code(
+                        '    : ${{self["interface"]}}(curTime, blockSize, rs)'
+                    )
+
+                    for dm in self.data_members.values():
+                        if dm.real_c_type in ("DataBlock", "WriteMask"):
+                            code(f"\t\t, m_{dm.ident}(blockSize)")
                 else:
                     code('    : ${{self["interface"]}}()')
 
+                    for dm in self.data_members.values():
+                        if dm.real_c_type in ("DataBlock", "WriteMask"):
+                            code(f"\t\t, m_{dm.ident}(local_{dm.ident})")
+            else:
+                ctor_count = 0
+                for dm in self.data_members.values():
+                    if dm.real_c_type in ("DataBlock", "WriteMask"):
+                        if ctor_count == 0:
+                            code("\t:")
+                        else:
+                            code("\t, ")
+                        code(f"\t\tm_{dm.ident}(local_{dm.ident})")
+                        ctor_count += 1
+
             code("{")
             code.indent()
             for dm in self.data_members.values():
-                code("m_${{dm.ident}} = local_${{dm.ident}};")
+                if dm.real_c_type not in ("DataBlock", "WriteMask"):
+                    code("m_${{dm.ident}} = local_${{dm.ident}};")
 
             code.dedent()
             code("}")
@@ -342,6 +412,35 @@ clone() const
             )
 
         if not self.isGlobal:
+            # Block size setter for fields that require block size
+            # Intentionally do not begin function name with "set" in case
+            # the user has a field named BlockSize which would conflict
+            # with the method generated below.
+            code("\nvoid initBlockSize(int block_size)")
+            code("{")
+            code("\tblock_size_bits = floorLog2(block_size);")
+
+            needs_block_size = (
+                "DataBlock",
+                "WriteMask",
+                "PersistentTable",
+                "TimerTable",
+                "PerfectCacheMemory",
+            )
+
+            for dm in self.data_members.values():
+                if dm.real_c_type in needs_block_size:
+                    code(f"\tm_{dm.ident}.setBlockSize(block_size);")
+            code("}\n")
+
+            code("\nvoid setRubySystem(RubySystem *ruby_system)")
+            code("{")
+            for dm in self.data_members.values():
+                if dm.real_c_type == "NetDest":
+                    code(f"// m_{dm.ident} requires RubySystem")
+                    code(f"\tm_{dm.ident}.setRubySystem(ruby_system);")
+            code("}\n")
+
             # const Get methods for each field
             code("// Const accessors methods for each field")
             for dm in self.data_members.values():
@@ -393,6 +492,9 @@ set${{dm.ident}}(const ${{dm.real_c_type}}& local_${{dm.ident}})
         code("  //private:")
         code.indent()
 
+        # block_size_bits for print methods
+        code("int block_size_bits = 0;")
+
         # Data members for each field
         for dm in self.data_members.values():
             if "abstract" not in dm:
@@ -473,7 +575,7 @@ ${{self.c_ident}}::print(std::ostream& out) const
             if dm.type.c_ident == "Addr":
                 code(
                     """
-out << "${{dm.ident}} = " << printAddress(m_${{dm.ident}}) << " ";"""
+out << "${{dm.ident}} = " << printAddress(m_${{dm.ident}}, block_size_bits) << " ";"""
                 )
             else:
                 code('out << "${{dm.ident}} = " << m_${{dm.ident}} << " ";' "")
@@ -846,7 +948,7 @@ ${{self.c_ident}}_from_base_level(int type)
  * \\return the base number of components for each machine
  */
 int
-${{self.c_ident}}_base_number(const ${{self.c_ident}}& obj)
+RubySystem::${{self.c_ident}}_base_number(const ${{self.c_ident}}& obj)
 {
     int base = 0;
     switch(obj) {
@@ -860,7 +962,7 @@ ${{self.c_ident}}_base_number(const ${{self.c_ident}}& obj)
                 # Check if there is a defined machine with this type
                 if enum.primary:
                     code(
-                        "    base += ${{enum.ident}}_Controller::getNumControllers();"
+                        "\tbase += m_num_controllers[${{self.c_ident}}_${{enum.ident}}];"
                     )
                 else:
                     code("    base += 0;")
@@ -882,7 +984,7 @@ ${{self.c_ident}}_base_number(const ${{self.c_ident}}& obj)
  * \\return the total number of components for each machine
  */
 int
-${{self.c_ident}}_base_count(const ${{self.c_ident}}& obj)
+RubySystem::${{self.c_ident}}_base_count(const ${{self.c_ident}}& obj)
 {
     switch(obj) {
 """
@@ -893,7 +995,7 @@ ${{self.c_ident}}_base_count(const ${{self.c_ident}}& obj)
                 code("case ${{self.c_ident}}_${{enum.ident}}:")
                 if enum.primary:
                     code(
-                        "return ${{enum.ident}}_Controller::getNumControllers();"
+                        "return m_num_controllers[${{self.c_ident}}_${{enum.ident}}];"
                     )
                 else:
                     code("return 0;")
diff --git a/src/python/gem5/components/cachehierarchies/chi/private_l1_cache_hierarchy.py b/src/python/gem5/components/cachehierarchies/chi/private_l1_cache_hierarchy.py
index 29df2a969c..a469fead61 100644
--- a/src/python/gem5/components/cachehierarchies/chi/private_l1_cache_hierarchy.py
+++ b/src/python/gem5/components/cachehierarchies/chi/private_l1_cache_hierarchy.py
@@ -137,7 +137,9 @@ class PrivateL1CacheHierarchy(AbstractRubyCacheHierarchy):
 
         # Set up a proxy port for the system_port. Used for load binaries and
         # other functional-only things.
-        self.ruby_system.sys_port_proxy = RubyPortProxy()
+        self.ruby_system.sys_port_proxy = RubyPortProxy(
+            ruby_system=self.ruby_system
+        )
         board.connect_system_port(self.ruby_system.sys_port_proxy.in_ports)
 
     def _create_core_cluster(
@@ -167,12 +169,16 @@ class PrivateL1CacheHierarchy(AbstractRubyCacheHierarchy):
         )
 
         cluster.icache.sequencer = RubySequencer(
-            version=core_num, dcache=NULL, clk_domain=cluster.icache.clk_domain
+            version=core_num,
+            dcache=NULL,
+            clk_domain=cluster.icache.clk_domain,
+            ruby_system=self.ruby_system,
         )
         cluster.dcache.sequencer = RubySequencer(
             version=core_num,
             dcache=cluster.dcache.cache,
             clk_domain=cluster.dcache.clk_domain,
+            ruby_system=self.ruby_system,
         )
 
         if board.has_io_bus():
@@ -223,7 +229,11 @@ class PrivateL1CacheHierarchy(AbstractRubyCacheHierarchy):
                 board.get_clock_domain(),
             )
             version = len(board.get_processor().get_cores()) + i
-            ctrl.sequencer = RubySequencer(version=version, in_ports=port)
+            ctrl.sequencer = RubySequencer(
+                version=version,
+                in_ports=port,
+                ruby_system=self.ruby_system,
+            )
             ctrl.sequencer.dcache = NULL
 
             ctrl.ruby_system = self.ruby_system
diff --git a/src/python/gem5/components/cachehierarchies/ruby/caches/mesi_three_level/directory.py b/src/python/gem5/components/cachehierarchies/ruby/caches/mesi_three_level/directory.py
index 4840e3b264..d0c54840fc 100644
--- a/src/python/gem5/components/cachehierarchies/ruby/caches/mesi_three_level/directory.py
+++ b/src/python/gem5/components/cachehierarchies/ruby/caches/mesi_three_level/directory.py
@@ -37,7 +37,7 @@ class Directory(AbstractDirectory):
     def __init__(self, network, cache_line_size, mem_range, port):
         super().__init__(network, cache_line_size)
         self.addr_ranges = [mem_range]
-        self.directory = RubyDirectoryMemory()
+        self.directory = RubyDirectoryMemory(block_size=cache_line_size)
         # Connect this directory to the memory side.
         self.memory_out_port = port
 
diff --git a/src/python/gem5/components/cachehierarchies/ruby/caches/mesi_three_level/l1_cache.py b/src/python/gem5/components/cachehierarchies/ruby/caches/mesi_three_level/l1_cache.py
index 6d203f978a..ef90ac79f6 100644
--- a/src/python/gem5/components/cachehierarchies/ruby/caches/mesi_three_level/l1_cache.py
+++ b/src/python/gem5/components/cachehierarchies/ruby/caches/mesi_three_level/l1_cache.py
@@ -80,7 +80,7 @@ class L1Cache(L0Cache_Controller):
             replacement_policy=LRURP(),
         )
         self.clk_domain = clk_domain
-        self.prefetcher = RubyPrefetcher()
+        self.prefetcher = RubyPrefetcher(block_size=cache_line_size)
         self.send_evictions = core.requires_send_evicts()
         self.transitions_per_cycle = 32
         self.enable_prefetch = False
diff --git a/src/python/gem5/components/cachehierarchies/ruby/caches/mesi_three_level/l2_cache.py b/src/python/gem5/components/cachehierarchies/ruby/caches/mesi_three_level/l2_cache.py
index ff2b8e3dd9..7c473f8be9 100644
--- a/src/python/gem5/components/cachehierarchies/ruby/caches/mesi_three_level/l2_cache.py
+++ b/src/python/gem5/components/cachehierarchies/ruby/caches/mesi_three_level/l2_cache.py
@@ -75,7 +75,7 @@ class L2Cache(L1Cache_Controller):
         self.l2_select_num_bits = int(math.log(num_l3Caches, 2))
         self.cluster_id = cluster_id
         self.clk_domain = clk_domain
-        self.prefetcher = RubyPrefetcher()
+        self.prefetcher = RubyPrefetcher(block_size=cache_line_size)
         self.transitions_per_cycle = 32
         # l1_request_latency, l1_response_latency, to_l2_latency are
         # ruby backend terminology.
diff --git a/src/python/gem5/components/cachehierarchies/ruby/caches/mesi_two_level/directory.py b/src/python/gem5/components/cachehierarchies/ruby/caches/mesi_two_level/directory.py
index 4840e3b264..d0c54840fc 100644
--- a/src/python/gem5/components/cachehierarchies/ruby/caches/mesi_two_level/directory.py
+++ b/src/python/gem5/components/cachehierarchies/ruby/caches/mesi_two_level/directory.py
@@ -37,7 +37,7 @@ class Directory(AbstractDirectory):
     def __init__(self, network, cache_line_size, mem_range, port):
         super().__init__(network, cache_line_size)
         self.addr_ranges = [mem_range]
-        self.directory = RubyDirectoryMemory()
+        self.directory = RubyDirectoryMemory(block_size=cache_line_size)
         # Connect this directory to the memory side.
         self.memory_out_port = port
 
diff --git a/src/python/gem5/components/cachehierarchies/ruby/caches/mesi_two_level/l1_cache.py b/src/python/gem5/components/cachehierarchies/ruby/caches/mesi_two_level/l1_cache.py
index 7787644c9b..13625beea7 100644
--- a/src/python/gem5/components/cachehierarchies/ruby/caches/mesi_two_level/l1_cache.py
+++ b/src/python/gem5/components/cachehierarchies/ruby/caches/mesi_two_level/l1_cache.py
@@ -73,7 +73,7 @@ class L1Cache(AbstractL1Cache):
         )
         self.l2_select_num_bits = int(math.log(num_l2Caches, 2))
         self.clk_domain = clk_domain
-        self.prefetcher = RubyPrefetcher()
+        self.prefetcher = RubyPrefetcher(block_size=cache_line_size)
         self.send_evictions = core.requires_send_evicts()
         self.transitions_per_cycle = 4
         self.enable_prefetch = False
diff --git a/src/python/gem5/components/cachehierarchies/ruby/caches/mi_example/directory.py b/src/python/gem5/components/cachehierarchies/ruby/caches/mi_example/directory.py
index 3d1ae54104..79e40e9e01 100644
--- a/src/python/gem5/components/cachehierarchies/ruby/caches/mi_example/directory.py
+++ b/src/python/gem5/components/cachehierarchies/ruby/caches/mi_example/directory.py
@@ -41,7 +41,7 @@ class Directory(AbstractDirectory):
     def __init__(self, network, cache_line_size, mem_range, port):
         super().__init__(network, cache_line_size)
         self.addr_ranges = [mem_range]
-        self.directory = RubyDirectoryMemory()
+        self.directory = RubyDirectoryMemory(block_size=cache_line_size)
         # Connect this directory to the memory side.
         self.memory_out_port = port
 
diff --git a/src/python/gem5/components/cachehierarchies/ruby/caches/prebuilt/octopi_cache/core_complex.py b/src/python/gem5/components/cachehierarchies/ruby/caches/prebuilt/octopi_cache/core_complex.py
index 9aa0dc4a36..212c06c4c3 100644
--- a/src/python/gem5/components/cachehierarchies/ruby/caches/prebuilt/octopi_cache/core_complex.py
+++ b/src/python/gem5/components/cachehierarchies/ruby/caches/prebuilt/octopi_cache/core_complex.py
@@ -143,6 +143,7 @@ class CoreComplex(SubSystem, RubyNetworkComponent):
             version=core_id,
             dcache=cluster.l1_cache.Dcache,
             clk_domain=cluster.l1_cache.clk_domain,
+            ruby_system=self._ruby_system,
         )
 
         if self._board.has_io_bus():
diff --git a/src/python/gem5/components/cachehierarchies/ruby/caches/prebuilt/octopi_cache/octopi.py b/src/python/gem5/components/cachehierarchies/ruby/caches/prebuilt/octopi_cache/octopi.py
index f7d4d63de1..83137ce15a 100644
--- a/src/python/gem5/components/cachehierarchies/ruby/caches/prebuilt/octopi_cache/octopi.py
+++ b/src/python/gem5/components/cachehierarchies/ruby/caches/prebuilt/octopi_cache/octopi.py
@@ -151,7 +151,9 @@ class OctopiCache(
 
         # Set up a proxy port for the system_port. Used for load binaries and
         # other functional-only things.
-        self.ruby_system.sys_port_proxy = RubyPortProxy()
+        self.ruby_system.sys_port_proxy = RubyPortProxy(
+            ruby_system=self.ruby_system
+        )
         board.connect_system_port(self.ruby_system.sys_port_proxy.in_ports)
 
     def _create_directory_controllers(self, board):
@@ -228,7 +230,11 @@ class OctopiCache(
         if board.has_dma_ports():
             self.ruby_system.dma_controllers = [
                 DMAController(
-                    dma_sequencer=DMASequencer(version=i + 1, in_ports=port),
+                    dma_sequencer=DMASequencer(
+                        version=i + 1,
+                        in_ports=port,
+                        ruby_system=self.ruby_system,
+                    ),
                     ruby_system=self.ruby_system,
                 )
                 for i, port in enumerate(board.get_dma_ports())
diff --git a/src/python/gem5/components/cachehierarchies/ruby/mesi_three_level_cache_hierarchy.py b/src/python/gem5/components/cachehierarchies/ruby/mesi_three_level_cache_hierarchy.py
index 66fea95636..92e8860a24 100644
--- a/src/python/gem5/components/cachehierarchies/ruby/mesi_three_level_cache_hierarchy.py
+++ b/src/python/gem5/components/cachehierarchies/ruby/mesi_three_level_cache_hierarchy.py
@@ -118,6 +118,7 @@ class MESIThreeLevelCacheHierarchy(
                 version=core_idx,
                 dcache=l1_cache.Dcache,
                 clk_domain=l1_cache.clk_domain,
+                ruby_system=self.ruby_system,
             )
 
             if board.has_io_bus():
@@ -196,7 +197,12 @@ class MESIThreeLevelCacheHierarchy(
             dma_ports = board.get_dma_ports()
             for i, port in enumerate(dma_ports):
                 ctrl = DMAController(
-                    DMASequencer(version=i, in_ports=port), self.ruby_system
+                    DMASequencer(
+                        version=i,
+                        in_ports=port,
+                        ruby_system=self.ruby_system,
+                    ),
+                    self.ruby_system,
                 )
                 self._dma_controllers.append(ctrl)
 
@@ -223,5 +229,7 @@ class MESIThreeLevelCacheHierarchy(
 
         # Set up a proxy port for the system_port. Used for load binaries and
         # other functional-only things.
-        self.ruby_system.sys_port_proxy = RubyPortProxy()
+        self.ruby_system.sys_port_proxy = RubyPortProxy(
+            ruby_system=self.ruby_system
+        )
         board.connect_system_port(self.ruby_system.sys_port_proxy.in_ports)
diff --git a/src/python/gem5/components/cachehierarchies/ruby/mesi_two_level_cache_hierarchy.py b/src/python/gem5/components/cachehierarchies/ruby/mesi_two_level_cache_hierarchy.py
index 004c2ff9d2..efe714c23c 100644
--- a/src/python/gem5/components/cachehierarchies/ruby/mesi_two_level_cache_hierarchy.py
+++ b/src/python/gem5/components/cachehierarchies/ruby/mesi_two_level_cache_hierarchy.py
@@ -109,7 +109,10 @@ class MESITwoLevelCacheHierarchy(
             )
 
             cache.sequencer = RubySequencer(
-                version=i, dcache=cache.L1Dcache, clk_domain=cache.clk_domain
+                version=i,
+                dcache=cache.L1Dcache,
+                clk_domain=cache.clk_domain,
+                ruby_system=self.ruby_system,
             )
 
             if board.has_io_bus():
@@ -163,7 +166,11 @@ class MESITwoLevelCacheHierarchy(
             dma_ports = board.get_dma_ports()
             for i, port in enumerate(dma_ports):
                 ctrl = DMAController(self.ruby_system.network, cache_line_size)
-                ctrl.dma_sequencer = DMASequencer(version=i, in_ports=port)
+                ctrl.dma_sequencer = DMASequencer(
+                    version=i,
+                    in_ports=port,
+                    ruby_system=self.ruby_system,
+                )
                 self._dma_controllers.append(ctrl)
                 ctrl.ruby_system = self.ruby_system
 
@@ -188,5 +195,7 @@ class MESITwoLevelCacheHierarchy(
 
         # Set up a proxy port for the system_port. Used for load binaries and
         # other functional-only things.
-        self.ruby_system.sys_port_proxy = RubyPortProxy()
+        self.ruby_system.sys_port_proxy = RubyPortProxy(
+            ruby_system=self.ruby_system
+        )
         board.connect_system_port(self.ruby_system.sys_port_proxy.in_ports)
diff --git a/src/python/gem5/components/cachehierarchies/ruby/mi_example_cache_hierarchy.py b/src/python/gem5/components/cachehierarchies/ruby/mi_example_cache_hierarchy.py
index 478c793560..56e620ff0c 100644
--- a/src/python/gem5/components/cachehierarchies/ruby/mi_example_cache_hierarchy.py
+++ b/src/python/gem5/components/cachehierarchies/ruby/mi_example_cache_hierarchy.py
@@ -95,6 +95,7 @@ class MIExampleCacheHierarchy(AbstractRubyCacheHierarchy):
                 version=i,
                 dcache=cache.cacheMemory,
                 clk_domain=cache.clk_domain,
+                ruby_system=self.ruby_system,
             )
 
             if board.has_io_bus():
@@ -140,7 +141,11 @@ class MIExampleCacheHierarchy(AbstractRubyCacheHierarchy):
                 ctrl = DMAController(
                     self.ruby_system.network, board.get_cache_line_size()
                 )
-                ctrl.dma_sequencer = DMASequencer(version=i, in_ports=port)
+                ctrl.dma_sequencer = DMASequencer(
+                    version=i,
+                    in_ports=port,
+                    ruby_system=self.ruby_system,
+                )
 
                 ctrl.ruby_system = self.ruby_system
                 ctrl.dma_sequencer.ruby_system = self.ruby_system
@@ -167,5 +172,7 @@ class MIExampleCacheHierarchy(AbstractRubyCacheHierarchy):
 
         # Set up a proxy port for the system_port. Used for load binaries and
         # other functional-only things.
-        self.ruby_system.sys_port_proxy = RubyPortProxy()
+        self.ruby_system.sys_port_proxy = RubyPortProxy(
+            ruby_system=self.ruby_system
+        )
         board.connect_system_port(self.ruby_system.sys_port_proxy.in_ports)

From 67edf64326788e0152a0f6adcf71aacd885c92f5 Mon Sep 17 00:00:00 2001
From: Yu-Cheng Chang <rogerycchang@google.com>
Date: Tue, 8 Oct 2024 23:51:20 +0800
Subject: [PATCH 15/47] arch-riscv: Fix CLINT mtime reset handling (#1638)

The previous PR, https://github.com/gem5/gem5/pull/1617, introduced the
CLINT reset feature. On reset, mtime is set to 0 and mtimecmp is kept
unchanged by default; the mtime and mtimecmp registers must also be
checked to update the MTI signal. However, doing so via
`raiseInterruptPin` incremented the mtime register to 1.

This PR introduces interrupt IDs for the CLINT so that mtime is
incremented only when the RTC signal is received.

---------

Co-authored-by: Jason Lowe-Power <jason@lowepower.com>
---
 src/dev/riscv/clint.cc | 10 ++++++----
 src/dev/riscv/clint.hh |  7 +++++++
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/src/dev/riscv/clint.cc b/src/dev/riscv/clint.cc
index fc959aced4..a18555fc87 100644
--- a/src/dev/riscv/clint.cc
+++ b/src/dev/riscv/clint.cc
@@ -53,7 +53,7 @@ Clint::Clint(const Params &params) :
     BasicPioDevice(params, params.pio_size),
     system(params.system),
     nThread(params.num_threads),
-    signal(params.name + ".signal", 0, this),
+    signal(params.name + ".signal", 0, this, INT_RTC),
     reset(params.name + ".reset"),
     resetMtimecmp(params.reset_mtimecmp),
     registers(params.name + ".registers", params.pio_addr, this,
@@ -69,9 +69,11 @@ Clint::Clint(const Params &params) :
 void
 Clint::raiseInterruptPin(int id)
 {
-    // Increment mtime
+    // Increment mtime only when the call comes from the RTC signal
     uint64_t& mtime = registers.mtime.get();
-    mtime++;
+    if (id == INT_RTC) {
+        mtime++;
+    }
 
     for (int context_id = 0; context_id < nThread; context_id++) {
 
@@ -261,7 +263,7 @@ Clint::doReset() {
         registers.msip[i].reset();
     }
     // We need to update the mtip interrupt bits when reset
-    raiseInterruptPin(0);
+    raiseInterruptPin(INT_RESET);
 }
 
 } // namespace gem5
diff --git a/src/dev/riscv/clint.hh b/src/dev/riscv/clint.hh
index 38f2117a16..2478eee0db 100644
--- a/src/dev/riscv/clint.hh
+++ b/src/dev/riscv/clint.hh
@@ -91,6 +91,13 @@ class Clint : public BasicPioDevice
     void raiseInterruptPin(int id);
     void lowerInterruptPin(int id) {}
 
+  // Interrupt IDs passed to raiseInterruptPin() to identify the caller
+  enum InterruptId
+  {
+      INT_RTC = 0, // from the RTC (signal port); mtime is incremented
+      INT_RESET, // from the reset port; mtime is not incremented
+  };
+
   // Register bank
   public:
 

From 402a030ce1590dafe6721e6eed42de3ae5245983 Mon Sep 17 00:00:00 2001
From: Yu-Cheng Chang <rogerycchang@google.com>
Date: Tue, 8 Oct 2024 23:51:38 +0800
Subject: [PATCH 16/47] cpu,arch,arch-riscv: Check wake up signal when post
 interrupt (#1641)

The RISC-V specification does not define how a hart wakes up on an
interrupt signal. In the SiFive U74 core, the hart wakes up if there is
any enabled pending interrupt [1].

[1] Section 14.3.1
https://sifive.cdn.prismic.io/sifive/ad5577a0-9a00-45c9-a5d0-424a3d586060_u74_core_complex_manual_21G3.pdf
---
 src/arch/generic/interrupts.hh | 6 ++++++
 src/arch/riscv/interrupts.hh   | 5 +++++
 src/cpu/base.cc                | 6 +++++-
 3 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/src/arch/generic/interrupts.hh b/src/arch/generic/interrupts.hh
index 510775594e..c2ffce038d 100644
--- a/src/arch/generic/interrupts.hh
+++ b/src/arch/generic/interrupts.hh
@@ -89,6 +89,12 @@ class BaseInterrupts : public SimObject
     {
         panic("Interrupts::clearAll unimplemented!\n");
     }
+
+    virtual bool
+    isWakeUp() const
+    {
+        return true;
+    }
 };
 
 } // namespace gem5
diff --git a/src/arch/riscv/interrupts.hh b/src/arch/riscv/interrupts.hh
index a10479fb65..54cf501f0a 100644
--- a/src/arch/riscv/interrupts.hh
+++ b/src/arch/riscv/interrupts.hh
@@ -95,6 +95,11 @@ class Interrupts : public BaseInterrupts
 
     void clearAll() override;
 
+    bool isWakeUp() const override
+    {
+        return checkNonMaskableInterrupt() || (ip & ie).any();
+    }
+
     uint64_t readIP() const { return (uint64_t)ip.to_ulong(); }
     uint64_t readIE() const { return (uint64_t)ie.to_ulong(); }
     void setIP(const uint64_t& val) { ip = val; }
diff --git a/src/cpu/base.cc b/src/cpu/base.cc
index ec219aa9f1..94d1a6e8e3 100644
--- a/src/cpu/base.cc
+++ b/src/cpu/base.cc
@@ -240,7 +240,11 @@ BaseCPU::postInterrupt(ThreadID tid, int int_num, int index)
     // Only wake up syscall emulation if it is not waiting on a futex.
     // This is to model the fact that instructions such as ARM SEV
     // should wake up a WFE sleep, but not a futex syscall WAIT.
-    if (FullSystem || !system->futexMap.is_waiting(threadContexts[tid]))
+    //
+    // In full-system mode the ISA decides via isWakeUp(): WFI wake up is
+    // implementation defined for RISC-V (SiFive: only if mip & mie != 0).
+    if (FullSystem ? interrupts[tid]->isWakeUp() :
+        !system->futexMap.is_waiting(threadContexts[tid]))
         wakeup(tid);
 }
 

From cc0eb12e9a5fbc250ab14b71e2876f1f3ef6e6d1 Mon Sep 17 00:00:00 2001
From: "Bobby R. Bruce" <bbruce@ucdavis.edu>
Date: Wed, 9 Oct 2024 06:24:57 -0700
Subject: [PATCH 17/47] misc,tests: Add cache of ALL/gem5.opt to ci-test.yaml
 (#1595)

Where appropriate, utilize caching of ALL/gem5.opt or VEGA_X86/gem5.opt.
The cache key is just the date returned by the runner. This is unlikely
to be the most efficient solution, but it is simple, and difficulties
were encountered when attempting to create a hash of the `src`
directory contents. This solution will do for now.
---
 .github/workflows/ci-tests.yaml | 63 +++++++++++++++++++++++++++++----
 1 file changed, 57 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/ci-tests.yaml b/.github/workflows/ci-tests.yaml
index 8e828675af..4a4ecb5ba4 100644
--- a/.github/workflows/ci-tests.yaml
+++ b/.github/workflows/ci-tests.yaml
@@ -21,17 +21,48 @@ jobs:
             - uses: actions/setup-python@v5
             - uses: pre-commit/action@v3.0.1
 
+    get-date:
+    # We use the date to label caches. A cache is a "hit" if the requested
+    # binary and date match what is stored in the cache. This means the first
+    # job to run on a given day for a given binary will always be a "miss" and
+    # will have to build the binary, then upload it as that day's binary.
+    # While this isn't the most efficient approach, the alternative was to
+    # hash the `src` directory contents and use that as the key. We found bugs
+    # with the hash function where the task would time out. This approach is
+    # simple, works, and still provides some level of caching.
+        runs-on: ubuntu-latest
+        outputs:
+            date: ${{ steps.date.outputs.date }}
+        steps:
+            - name: Get the current date
+              id: date
+              # Written to GITHUB_OUTPUT so `steps.date.outputs.date` is set.
+              run: echo "date=$(date +'%Y-%m-%d')" >> "$GITHUB_OUTPUT"
+
     unittests-all-opt:
         runs-on: [self-hosted, linux, x64]
         if: github.event.pull_request.draft == false
         container: ghcr.io/gem5/ubuntu-24.04_all-dependencies:latest
-        needs: [pre-commit] # only runs if pre-commit passes.
+        needs: [pre-commit, get-date] # only runs if pre-commit passes.
         timeout-minutes: 60
         steps:
             - uses: actions/checkout@v4
+
+
+            # Restore the cache if available. As this just builds the unittests
+            # we only obtain the cache and do not provide it if it is not
+            # available.
+            - name: Cache build/ALL
+              uses: actions/cache/restore@v4
+              with:
+                  path: build/ALL
+                  key: testlib-build-all-${{ needs.get-date.outputs.date }}
+                  restore-keys: |
+                      testlib-build-all
+
             - name: CI Unittests
               working-directory: ${{ github.workspace }}
-              run: scons build/ALL/unittests.opt -j $(nproc)
+              run: scons --no-compress-debug build/ALL/unittests.opt -j $(nproc)
             - run: echo "This job's status is ${{ job.status }}."
 
     testlib-quick-matrix:
@@ -83,14 +114,24 @@ jobs:
         runs-on: [self-hosted, linux, x64]
         if: github.event.pull_request.draft == false
         container: ghcr.io/gem5/ubuntu-24.04_all-dependencies:latest
-        needs: [pre-commit, testlib-quick-matrix]
+        needs: [pre-commit, testlib-quick-matrix, get-date]
         strategy:
             matrix:
                 build-target: ${{ fromJson(needs.testlib-quick-matrix.outputs.build-matrix) }}
         steps:
             - uses: actions/checkout@v4
+
+            - name: Cache build/ALL
+              uses: actions/cache@v4
+              if: ${{ endsWith(matrix.build-target, 'build/ALL/gem5.opt') }}
+              with:
+                  path: build/ALL
+                  key: testlib-build-all-${{ needs.get-date.outputs.date }}
+                  restore-keys: |
+                      testlib-build-all
+
             - name: Build gem5
-              run: scons ${{ matrix.build-target }} -j $(nproc)
+              run: scons --no-compress-debug ${{ matrix.build-target }} -j $(nproc)
 
         # Upload the gem5 binary as an artifact.
         # Note: the "achor.txt" file is a hack to make sure the paths are
@@ -199,13 +240,23 @@ jobs:
         runs-on: [self-hosted, linux, x64]
         container: ghcr.io/gem5/gcn-gpu:latest
         timeout-minutes: 180
-        needs: [pre-commit]
+        needs: [pre-commit, get-date]
         steps:
             - uses: actions/checkout@v4
 
+            # Obtain the cache if available. If not available this will upload
+            # this job's instance of the cache.
+            - name: Cache build/VEGA_X86
+              uses: actions/cache@v4
+              with:
+                  path: build/VEGA_X86
+                  key: testlib-build-vega-${{ needs.get-date.outputs.date }}
+                  restore-keys: |
+                      testlib-build-vega
+
             # Build the VEGA_X86/gem5.opt binary.
             - name: Build VEGA_X86/gem5.opt
-              run: scons build/VEGA_X86/gem5.opt -j`nproc`
+              run: scons --no-compress-debug build/VEGA_X86/gem5.opt -j`nproc`
 
             # Run the GPU tests.
             - name: Run Testlib GPU Tests

From ee91356632835b54aefecdfa143601a9a6a996ce Mon Sep 17 00:00:00 2001
From: wmin0 <yuhsingw@google.com>
Date: Wed, 9 Oct 2024 21:28:43 +0800
Subject: [PATCH 18/47] systemc: Disable 'overloaded-virtual' warn for systemc
 bind funcs (#1637)

For GCC >=v13 systemc was breaking due to the overloaded virtual warning
check.

Issue: gem5#1121

Change-Id: I68872f58d0bbe5430976163ba7316bbd2e403ec8
---
 src/systemc/ext/core/sc_export.hh              | 12 ++++++++++++
 src/systemc/ext/core/sc_port.hh                | 17 ++++++++++++-----
 .../ext/tlm_core/2/sockets/initiator_socket.hh | 18 +++++++++++++-----
 .../ext/tlm_core/2/sockets/target_socket.hh    |  1 +
 4 files changed, 38 insertions(+), 10 deletions(-)

diff --git a/src/systemc/ext/core/sc_export.hh b/src/systemc/ext/core/sc_export.hh
index c93f01a9a3..f231968e77 100644
--- a/src/systemc/ext/core/sc_export.hh
+++ b/src/systemc/ext/core/sc_export.hh
@@ -70,6 +70,17 @@ class sc_export : public sc_export_base
 
     virtual const char *kind() const override { return "sc_export"; }
 
+#pragma GCC diagnostic push
+/**
+ * The following warning is disabled because the bind methods are overloaded
+ * in the derived class and the base class. In GCC v13+ this
+ * 'overloaded-virtual' warning is strict enough to trigger here (though the
+ * code is correct).
+ * Please check section 9.3 of SystemC 2.3.1 release note for more details.
+ */
+#if defined(__GNUC__) && (__GNUC__ >= 13)
+#pragma GCC diagnostic ignored "-Woverloaded-virtual"
+#endif
     void operator () (IF &i) { bind(i); }
     virtual void
     bind(IF &i)
@@ -80,6 +91,7 @@ class sc_export : public sc_export_base
         }
         interface = &i;
     }
+#pragma GCC diagnostic pop
     operator IF & ()
     {
         if (!interface)
diff --git a/src/systemc/ext/core/sc_port.hh b/src/systemc/ext/core/sc_port.hh
index bd57553559..bf00cb9361 100644
--- a/src/systemc/ext/core/sc_port.hh
+++ b/src/systemc/ext/core/sc_port.hh
@@ -114,19 +114,27 @@ class sc_port_base : public sc_object
     virtual sc_port_policy _portPolicy() const = 0;
 };
 
-// The overloaded virtual is intended in SystemC, so we'll disable the warning.
-// Please check section 9.3 of SystemC 2.3.1 release note for more details.
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Woverloaded-virtual"
 template <class IF>
 class sc_port_b : public sc_port_base
 {
   public:
+#pragma GCC diagnostic push
+/**
+ * The following warning is disabled because the bind methods are overloaded
+ * in the derived class and the base class. In GCC v13+ this
+ * 'overloaded-virtual' warning is strict enough to trigger here (though the
+ * code is correct).
+ * Please check section 9.3 of SystemC 2.3.1 release note for more details.
+ */
+#if defined(__GNUC__) && (__GNUC__ >= 13)
+#pragma GCC diagnostic ignored "-Woverloaded-virtual"
+#endif
     void operator () (IF &i) { bind(i); }
     void operator () (sc_port_b<IF> &p) { bind(p); }
 
     virtual void bind(IF &i) { sc_port_base::bind(i); }
     virtual void bind(sc_port_b<IF> &p) { sc_port_base::bind(p); }
+#pragma GCC diagnostic pop
 
     IF *
     operator -> ()
@@ -248,7 +256,6 @@ class sc_port_b : public sc_port_base
     sc_port_b(const sc_port_b<IF> &) {}
     sc_port_b<IF> &operator = (const sc_port_b<IF> &) { return *this; }
 };
-#pragma GCC diagnostic pop
 
 template <class IF, int N=1, sc_port_policy P=SC_ONE_OR_MORE_BOUND>
 class sc_port : public sc_port_b<IF>
diff --git a/src/systemc/ext/tlm_core/2/sockets/initiator_socket.hh b/src/systemc/ext/tlm_core/2/sockets/initiator_socket.hh
index 4f67b59237..2bb97f7945 100644
--- a/src/systemc/ext/tlm_core/2/sockets/initiator_socket.hh
+++ b/src/systemc/ext/tlm_core/2/sockets/initiator_socket.hh
@@ -51,10 +51,6 @@ template <unsigned int BUSWIDTH, typename FW_IF, typename BW_IF, int N,
           sc_core::sc_port_policy POL>
 class tlm_base_target_socket;
 
-// The overloaded virtual is intended in SystemC, so we'll disable the warning.
-// Please check section 9.3 of SystemC 2.3.1 release note for more details.
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Woverloaded-virtual"
 template <unsigned int BUSWIDTH=32, typename FW_IF=tlm_fw_transport_if<>,
           typename BW_IF=tlm_bw_transport_if<>, int N=1,
           sc_core::sc_port_policy POL=sc_core::SC_ONE_OR_MORE_BOUND>
@@ -100,6 +96,18 @@ class tlm_base_initiator_socket :
     // - Binds the port of the target socket to the export of the initiator
     //   socket
     //
+
+#pragma GCC diagnostic push
+/**
+ * The following warning is disabled because the bind methods are overloaded
+ * in the derived class and the base class. In GCC v13+ this
+ * 'overloaded-virtual' warning is strict enough to trigger here (though the
+ * code is correct).
+ * Please check section 9.3 of SystemC 2.3.1 release note for more details.
+ */
+#if defined(__GNUC__) && (__GNUC__ >= 13)
+#pragma GCC diagnostic ignored "-Woverloaded-virtual"
+#endif
     virtual void
     bind(base_target_socket_type &s)
     {
@@ -132,6 +140,7 @@ class tlm_base_initiator_socket :
     //
     virtual void bind(bw_interface_type &ifs) { (get_base_export())(ifs); }
     void operator() (bw_interface_type &s) { bind(s); }
+#pragma GCC diagnostic pop
 
     // Implementation of tlm_base_socket_if functions
     virtual sc_core::sc_port_base &get_port_base() { return *this; }
@@ -174,7 +183,6 @@ class tlm_base_initiator_socket :
   protected:
     export_type m_export;
 };
-#pragma GCC diagnostic pop
 
 //
 // Convenience socket classes
diff --git a/src/systemc/ext/tlm_core/2/sockets/target_socket.hh b/src/systemc/ext/tlm_core/2/sockets/target_socket.hh
index 5da81d892e..3f5cb98ae4 100644
--- a/src/systemc/ext/tlm_core/2/sockets/target_socket.hh
+++ b/src/systemc/ext/tlm_core/2/sockets/target_socket.hh
@@ -98,6 +98,7 @@ class tlm_base_target_socket :
  * in the derived class and the base class. In GCC v13+ this
  * 'overloaded-virtual' warning is strict enough to trigger here (though the
  * code is correct).
+ * Please check section 9.3 of SystemC 2.3.1 release note for more details.
  */
 #if defined(__GNUC__) && (__GNUC__ >= 13)
 #pragma GCC diagnostic ignored "-Woverloaded-virtual"

From 11fa0ac9a5ea9e2ef5814be0a7e433698c47a633 Mon Sep 17 00:00:00 2001
From: "Bobby R. Bruce" <bbruce@ucdavis.edu>
Date: Wed, 9 Oct 2024 05:06:13 -0700
Subject: [PATCH 19/47] stdlib: Mv setup_board/setup_mem_ranges calls to set_fs

This change allows the `_setup_memory_ranges` and `_setup_board`
functions to know if the board is to run an FS or SE workload, thus
allowing a board to handle both cases considerably more easily than
before. With this change all functions are called after FS or SE
is declared via the `_set_fullsystem` function and thus all can
accommodate SE and FS workloads.
---
 .../gem5/components/boards/abstract_board.py  | 38 +++++++++++--------
 1 file changed, 23 insertions(+), 15 deletions(-)

diff --git a/src/python/gem5/components/boards/abstract_board.py b/src/python/gem5/components/boards/abstract_board.py
index 5819adc9de..cd6f559937 100644
--- a/src/python/gem5/components/boards/abstract_board.py
+++ b/src/python/gem5/components/boards/abstract_board.py
@@ -118,12 +118,6 @@ class AbstractBoard:
         # Simulator module.
         self._checkpoint = None
 
-        # Setup the board and memory system's memory ranges.
-        self._setup_memory_ranges()
-
-        # Setup board properties unique to the board being constructed.
-        self._setup_board()
-
         # A private variable to record whether `_connect_things` has been
         # been called.
         self._connect_things_called = False
@@ -195,6 +189,9 @@ class AbstractBoard:
         """
         self._is_fs = is_fs
 
+        self._setup_memory_ranges()
+        self._setup_board()
+
     def is_fullsystem(self) -> bool:
         """
         Returns ``True`` if the board is to be run in FS mode. Otherwise the board
@@ -253,11 +250,14 @@ class AbstractBoard:
     @abstractmethod
     def _setup_board(self) -> None:
         """
-        This function is called in the AbstractBoard constructor, before the
-        memory, processor, and cache hierarchy components are incorporated via
-        ``_connect_thing()``, but after the ``_setup_memory_ranges()`` function.
-        This function should be overridden by boards to specify components,
-        connections unique to that board.
+        This function is called at the end of `_set_fullsystem`. The reason for
+        this is the board's configuration varies significantly depending on
+        whether it is to be run in FS or SE mode. This function is therefore
+        called when a workload is set --- after construction but before
+        `_pre_instantiate` is called.
+
+        As `_setup_memory_ranges()` is called immediately before this function
+        in `_set_fullsystem`, the memory ranges are already set up here.
         """
         raise NotImplementedError
 
@@ -331,10 +331,18 @@ class AbstractBoard:
         """
         Set the memory ranges for this board and memory system.
 
-        This is called in the constructor, prior to ``_setup_board`` and
-        ``_connect_things``. It should query the board's memory to determine the
-        size and the set the memory ranges on the memory system and on the
-        board.
+        This is called at the end of the `_set_fullsystem` function but before
+        `_setup_board`.  `_set_fullsystem` is called when the workload is
+        declared. It is before `_pre_instantiate` (but, obviously, after
+        construction).
+
+        It should query the board's memory
+        to determine the size and then set the memory ranges on the memory
+        system and on the board.
+
+        As this is called at the end of `_set_fullsystem`, the board's memory
+        can be set up differently depending on whether the board is to be run
+        in FS or SE mode.
 
         The simplest implementation sets the board's memory range to the size
         of memory and memory system's range to be the same as the board. Full

From 7661116b0083054317b157a02b974f439e971cdb Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 7 Oct 2024 21:48:59 +0000
Subject: [PATCH 20/47] misc: [pre-commit.ci] pre-commit autoupdate
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

updates:
- [github.com/pre-commit/pre-commit-hooks: v4.5.0 → v5.0.0](https://github.com/pre-commit/pre-commit-hooks/compare/v4.5.0...v5.0.0)
- [github.com/PyCQA/isort: 5.11.5 → 5.13.2](https://github.com/PyCQA/isort/compare/5.11.5...5.13.2)
- [github.com/psf/black: 23.9.1 → 24.10.0](https://github.com/psf/black/compare/23.9.1...24.10.0)
- [github.com/asottile/pyupgrade: v3.14.0 → v3.17.0](https://github.com/asottile/pyupgrade/compare/v3.14.0...v3.17.0)
---
 .pre-commit-config.yaml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 7e17adca7f..03e39a3639 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -49,11 +49,11 @@ exclude: |
       tests/.*/ref/.*
     )$
 
-default_stages: [commit]
+default_stages: [pre-commit]
 
 repos:
     - repo: https://github.com/pre-commit/pre-commit-hooks
-      rev: v4.5.0
+      rev: v5.0.0
       hooks:
           - id: trailing-whitespace
           - id: end-of-file-fixer
@@ -69,7 +69,7 @@ repos:
           - id: destroyed-symlinks
           - id: requirements-txt-fixer
     - repo: https://github.com/PyCQA/isort
-      rev: 5.11.5
+      rev: 5.13.2
       hooks:
           - id: isort
     - repo: https://github.com/jumanjihouse/pre-commit-hook-yamlfmt
@@ -77,11 +77,11 @@ repos:
       hooks:
           - id: yamlfmt
     - repo: https://github.com/psf/black
-      rev: 23.9.1
+      rev: 24.10.0
       hooks:
           - id: black
     - repo: https://github.com/asottile/pyupgrade
-      rev: v3.14.0
+      rev: v3.17.0
       hooks:
           - id: pyupgrade
             # Python 3.8 is the earliest version supported.

From 54487d3bf6aa0ff7fd11f0a40a2d1756d887c371 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 9 Oct 2024 14:04:51 +0000
Subject: [PATCH 21/47] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 configs/common/HMC.py                         | 56 +++++++++----------
 configs/common/Simulation.py                  |  6 +-
 configs/deprecated/example/fs.py              |  6 +-
 configs/example/apu_se.py                     |  6 +-
 configs/example/gpufs/system/system.py        | 12 ++--
 configs/example/read_config.py                |  8 ++-
 src/arch/isa_parser/isa_parser.py             | 13 +++--
 src/arch/micro_asm.py                         | 12 ++--
 .../boards/abstract_system_board.py           |  1 -
 .../gem5/components/boards/test_board.py      |  1 -
 .../stats/configs/pystat_vector2d_check.py    |  8 ++-
 util/minorview/model.py                       | 12 ++--
 12 files changed, 71 insertions(+), 70 deletions(-)

diff --git a/configs/common/HMC.py b/configs/common/HMC.py
index 98ff091115..0dfbebb3e5 100644
--- a/configs/common/HMC.py
+++ b/configs/common/HMC.py
@@ -568,9 +568,9 @@ def config_hmc_dev(opt, system, hmc_host):
     # Attach 4 serial link to 4 crossbar/s
     for i in range(opt.num_serial_links):
         if opt.enable_link_monitor:
-            system.hmc_host.seriallink[
-                i
-            ].mem_side_port = system.hmc_dev.lmonitor[i].cpu_side_port
+            system.hmc_host.seriallink[i].mem_side_port = (
+                system.hmc_dev.lmonitor[i].cpu_side_port
+            )
             system.hmc_dev.lmonitor[i].mem_side_port = system.hmc_dev.xbar[
                 i
             ].cpu_side_ports
@@ -613,14 +613,12 @@ def config_hmc_dev(opt, system, hmc_host):
                     ]
 
                     # Connect the bridge between corssbars
-                    system.hmc_dev.xbar[
-                        i
-                    ].mem_side_ports = system.hmc_dev.buffers[
-                        index
-                    ].cpu_side_port
-                    system.hmc_dev.buffers[
-                        index
-                    ].mem_side_port = system.hmc_dev.xbar[j].cpu_side_ports
+                    system.hmc_dev.xbar[i].mem_side_ports = (
+                        system.hmc_dev.buffers[index].cpu_side_port
+                    )
+                    system.hmc_dev.buffers[index].mem_side_port = (
+                        system.hmc_dev.xbar[j].cpu_side_ports
+                    )
                 else:
                     # Don't connect the xbar to itself
                     pass
@@ -629,49 +627,49 @@ def config_hmc_dev(opt, system, hmc_host):
     # can only direct traffic to it local vaults
     if opt.arch == "mixed":
         system.hmc_dev.buffer30 = Bridge(ranges=system.mem_ranges[0:4])
-        system.hmc_dev.xbar[
-            3
-        ].mem_side_ports = system.hmc_dev.buffer30.cpu_side_port
+        system.hmc_dev.xbar[3].mem_side_ports = (
+            system.hmc_dev.buffer30.cpu_side_port
+        )
         system.hmc_dev.buffer30.mem_side_port = system.hmc_dev.xbar[
             0
         ].cpu_side_ports
 
         system.hmc_dev.buffer31 = Bridge(ranges=system.mem_ranges[4:8])
-        system.hmc_dev.xbar[
-            3
-        ].mem_side_ports = system.hmc_dev.buffer31.cpu_side_port
+        system.hmc_dev.xbar[3].mem_side_ports = (
+            system.hmc_dev.buffer31.cpu_side_port
+        )
         system.hmc_dev.buffer31.mem_side_port = system.hmc_dev.xbar[
             1
         ].cpu_side_ports
 
         system.hmc_dev.buffer32 = Bridge(ranges=system.mem_ranges[8:12])
-        system.hmc_dev.xbar[
-            3
-        ].mem_side_ports = system.hmc_dev.buffer32.cpu_side_port
+        system.hmc_dev.xbar[3].mem_side_ports = (
+            system.hmc_dev.buffer32.cpu_side_port
+        )
         system.hmc_dev.buffer32.mem_side_port = system.hmc_dev.xbar[
             2
         ].cpu_side_ports
 
         system.hmc_dev.buffer20 = Bridge(ranges=system.mem_ranges[0:4])
-        system.hmc_dev.xbar[
-            2
-        ].mem_side_ports = system.hmc_dev.buffer20.cpu_side_port
+        system.hmc_dev.xbar[2].mem_side_ports = (
+            system.hmc_dev.buffer20.cpu_side_port
+        )
         system.hmc_dev.buffer20.mem_side_port = system.hmc_dev.xbar[
             0
         ].cpu_side_ports
 
         system.hmc_dev.buffer21 = Bridge(ranges=system.mem_ranges[4:8])
-        system.hmc_dev.xbar[
-            2
-        ].mem_side_ports = system.hmc_dev.buffer21.cpu_side_port
+        system.hmc_dev.xbar[2].mem_side_ports = (
+            system.hmc_dev.buffer21.cpu_side_port
+        )
         system.hmc_dev.buffer21.mem_side_port = system.hmc_dev.xbar[
             1
         ].cpu_side_ports
 
         system.hmc_dev.buffer23 = Bridge(ranges=system.mem_ranges[12:16])
-        system.hmc_dev.xbar[
-            2
-        ].mem_side_ports = system.hmc_dev.buffer23.cpu_side_port
+        system.hmc_dev.xbar[2].mem_side_ports = (
+            system.hmc_dev.buffer23.cpu_side_port
+        )
         system.hmc_dev.buffer23.mem_side_port = system.hmc_dev.xbar[
             3
         ].cpu_side_ports
diff --git a/configs/common/Simulation.py b/configs/common/Simulation.py
index 3e332d76b4..be928651ae 100644
--- a/configs/common/Simulation.py
+++ b/configs/common/Simulation.py
@@ -541,9 +541,9 @@ def run(options, root, testsys, cpu_class):
                 IndirectBPClass = ObjectList.indirect_bp_list.get(
                     options.indirect_bp_type
                 )
-                switch_cpus[
-                    i
-                ].branchPred.indirectBranchPred = IndirectBPClass()
+                switch_cpus[i].branchPred.indirectBranchPred = (
+                    IndirectBPClass()
+                )
             switch_cpus[i].createThreads()
 
         # If elastic tracing is enabled attach the elastic trace probe
diff --git a/configs/deprecated/example/fs.py b/configs/deprecated/example/fs.py
index 7426c47c7e..df77b6d830 100644
--- a/configs/deprecated/example/fs.py
+++ b/configs/deprecated/example/fs.py
@@ -213,9 +213,9 @@ def build_test_system(np, isa: ISA):
                     IndirectBPClass = ObjectList.indirect_bp_list.get(
                         args.indirect_bp_type
                     )
-                    test_sys.cpu[
-                        i
-                    ].branchPred.indirectBranchPred = IndirectBPClass()
+                    test_sys.cpu[i].branchPred.indirectBranchPred = (
+                        IndirectBPClass()
+                    )
             test_sys.cpu[i].createThreads()
 
         # If elastic tracing is enabled when not restoring from checkpoint and
diff --git a/configs/example/apu_se.py b/configs/example/apu_se.py
index 1ae6edf391..d512594afe 100644
--- a/configs/example/apu_se.py
+++ b/configs/example/apu_se.py
@@ -935,9 +935,9 @@ gpu_port_idx = gpu_port_idx - args.num_cp * 2
 token_port_idx = 0
 for i in range(len(system.ruby._cpu_ports)):
     if isinstance(system.ruby._cpu_ports[i], VIPERCoalescer):
-        system.cpu[shader_idx].CUs[
-            token_port_idx
-        ].gmTokenPort = system.ruby._cpu_ports[i].gmTokenPort
+        system.cpu[shader_idx].CUs[token_port_idx].gmTokenPort = (
+            system.ruby._cpu_ports[i].gmTokenPort
+        )
         token_port_idx += 1
 
 wavefront_size = args.wf_size
diff --git a/configs/example/gpufs/system/system.py b/configs/example/gpufs/system/system.py
index 1ce261d764..b650659303 100644
--- a/configs/example/gpufs/system/system.py
+++ b/configs/example/gpufs/system/system.py
@@ -336,9 +336,9 @@ def makeGpuFSSystem(args):
     token_port_idx = 0
     for i in range(len(system.ruby._cpu_ports)):
         if isinstance(system.ruby._cpu_ports[i], VIPERCoalescer):
-            system.cpu[shader_idx].CUs[
-                token_port_idx
-            ].gmTokenPort = system.ruby._cpu_ports[i].gmTokenPort
+            system.cpu[shader_idx].CUs[token_port_idx].gmTokenPort = (
+                system.ruby._cpu_ports[i].gmTokenPort
+            )
             token_port_idx += 1
 
     wavefront_size = args.wf_size
@@ -346,9 +346,9 @@ def makeGpuFSSystem(args):
         # The pipeline issues wavefront_size number of uncoalesced requests
         # in one GPU issue cycle. Hence wavefront_size mem ports.
         for j in range(wavefront_size):
-            system.cpu[shader_idx].CUs[i].memory_port[
-                j
-            ] = system.ruby._cpu_ports[gpu_port_idx].in_ports[j]
+            system.cpu[shader_idx].CUs[i].memory_port[j] = (
+                system.ruby._cpu_ports[gpu_port_idx].in_ports[j]
+            )
         gpu_port_idx += 1
 
     for i in range(args.num_compute_units):
diff --git a/configs/example/read_config.py b/configs/example/read_config.py
index 27e23b69ee..9f86c3af49 100644
--- a/configs/example/read_config.py
+++ b/configs/example/read_config.py
@@ -250,9 +250,11 @@ class ConfigManager:
                         obj,
                         param_name,
                         [
-                            self.objects_by_name[name]
-                            if name != "Null"
-                            else m5.params.NULL
+                            (
+                                self.objects_by_name[name]
+                                if name != "Null"
+                                else m5.params.NULL
+                            )
                             for name in param_values
                         ],
                     )
diff --git a/src/arch/isa_parser/isa_parser.py b/src/arch/isa_parser/isa_parser.py
index 7cc95ed6e8..0499beab83 100755
--- a/src/arch/isa_parser/isa_parser.py
+++ b/src/arch/isa_parser/isa_parser.py
@@ -111,11 +111,12 @@ class Template:
 
             operands = SubOperandList(self.parser, compositeCode, d.operands)
 
-            myDict[
-                "reg_idx_arr_decl"
-            ] = "RegId srcRegIdxArr[%d]; RegId destRegIdxArr[%d]" % (
-                d.operands.numSrcRegs + d.srcRegIdxPadding,
-                d.operands.numDestRegs + d.destRegIdxPadding,
+            myDict["reg_idx_arr_decl"] = (
+                "RegId srcRegIdxArr[%d]; RegId destRegIdxArr[%d]"
+                % (
+                    d.operands.numSrcRegs + d.srcRegIdxPadding,
+                    d.operands.numDestRegs + d.destRegIdxPadding,
+                )
             )
 
             # The reinterpret casts are largely because an array with a known
@@ -821,7 +822,7 @@ class ISAParser(Grammar):
         "DBLCOLON",
         "ASTERISK",
         # C preprocessor directives
-        "CPPDIRECTIVE"
+        "CPPDIRECTIVE",
         # The following are matched but never returned. commented out to
         # suppress PLY warning
         # newfile directive
diff --git a/src/arch/micro_asm.py b/src/arch/micro_asm.py
index 0329800896..5b4f79fce3 100644
--- a/src/arch/micro_asm.py
+++ b/src/arch/micro_asm.py
@@ -140,9 +140,9 @@ def handle_statement(parser, container, statement):
     if statement.is_microop:
         if statement.mnemonic not in parser.microops.keys():
             raise Exception(f"Unrecognized mnemonic: {statement.mnemonic}")
-        parser.symbols[
-            "__microopClassFromInsideTheAssembler"
-        ] = parser.microops[statement.mnemonic]
+        parser.symbols["__microopClassFromInsideTheAssembler"] = (
+            parser.microops[statement.mnemonic]
+        )
         try:
             microop = eval(
                 f"__microopClassFromInsideTheAssembler({statement.params})",
@@ -166,9 +166,9 @@ def handle_statement(parser, container, statement):
     elif statement.is_directive:
         if statement.name not in container.directives.keys():
             raise Exception(f"Unrecognized directive: {statement.name}")
-        parser.symbols[
-            "__directiveFunctionFromInsideTheAssembler"
-        ] = container.directives[statement.name]
+        parser.symbols["__directiveFunctionFromInsideTheAssembler"] = (
+            container.directives[statement.name]
+        )
         try:
             eval(
                 f"__directiveFunctionFromInsideTheAssembler({statement.params})",
diff --git a/src/python/gem5/components/boards/abstract_system_board.py b/src/python/gem5/components/boards/abstract_system_board.py
index 8fe48920b5..a8765ee909 100644
--- a/src/python/gem5/components/boards/abstract_system_board.py
+++ b/src/python/gem5/components/boards/abstract_system_board.py
@@ -36,7 +36,6 @@ from .abstract_board import AbstractBoard
 
 
 class AbstractSystemBoard(System, AbstractBoard):
-
     """
     An abstract board for cases where boards should inherit from System.
     """
diff --git a/src/python/gem5/components/boards/test_board.py b/src/python/gem5/components/boards/test_board.py
index 2599c6853d..6acce79b1c 100644
--- a/src/python/gem5/components/boards/test_board.py
+++ b/src/python/gem5/components/boards/test_board.py
@@ -44,7 +44,6 @@ from .abstract_system_board import AbstractSystemBoard
 
 
 class TestBoard(AbstractSystemBoard):
-
     """This is a Testing Board used to run traffic generators on a simple
     architecture.
 
diff --git a/tests/gem5/stats/configs/pystat_vector2d_check.py b/tests/gem5/stats/configs/pystat_vector2d_check.py
index 617463e56f..909de12232 100644
--- a/tests/gem5/stats/configs/pystat_vector2d_check.py
+++ b/tests/gem5/stats/configs/pystat_vector2d_check.py
@@ -138,9 +138,11 @@ for x in range(args.num_vectors):
 
     vectors[x_index] = {
         "type": "Vector",
-        "description": stat_tester.subdescs[x]
-        if x in stat_tester.subdescs
-        else stat_tester.description,
+        "description": (
+            stat_tester.subdescs[x]
+            if x in stat_tester.subdescs
+            else stat_tester.description
+        ),
         "value": vector,
     }
 
diff --git a/util/minorview/model.py b/util/minorview/model.py
index 91979825c3..d84680fcd3 100644
--- a/util/minorview/model.py
+++ b/util/minorview/model.py
@@ -374,9 +374,9 @@ class TwoDColours(ColourPattern):
 
                 for index, value in parsed:
                     try:
-                        array[index % strips][
-                            index / strips
-                        ] = special_view_decoder(elemClass)(value)
+                        array[index % strips][index / strips] = (
+                            special_view_decoder(elemClass)(value)
+                        )
                     except:
                         print(
                             "Element out of range strips: %d,"
@@ -912,9 +912,9 @@ class BlobModel:
                         blobs = self.unitNameToBlobs.get(unit, [])
                         for blob in blobs:
                             if blob.visualDecoder is not None:
-                                event.visuals[
-                                    blob.picChar
-                                ] = blob.visualDecoder(pairs)
+                                event.visuals[blob.picChar] = (
+                                    blob.visualDecoder(pairs)
+                                )
 
                         self.add_unit_event(event)
                         last_time_lines[unit] = rest

From f03dddb458e5a9f0a0279ce053f478417a4666eb Mon Sep 17 00:00:00 2001
From: Jason Lowe-Power <jason@lowepower.com>
Date: Wed, 9 Oct 2024 13:21:28 -0700
Subject: [PATCH 22/47] Use board get_mem_ports consistently (#1509)

Previously, whether the board object or the memory_system returned
the memory ports was not consistent in the cache_hierarchies.

This commit makes it consistently use the board. Note: the board
is a better place so it can customize the ports (e.g., add I/O
components or other things).

This commit also makes the arm board consistent with the other
boards and removes the specialized `get_mem_ports` that was not
used.
---
 src/python/gem5/components/boards/arm_board.py       | 12 ++++++++----
 .../components/cachehierarchies/classic/no_cache.py  |  2 +-
 .../classic/private_l1_cache_hierarchy.py            |  2 +-
 .../classic/private_l1_private_l2_cache_hierarchy.py |  2 +-
 .../classic/private_l1_shared_l2_cache_hierarchy.py  |  2 +-
 5 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/src/python/gem5/components/boards/arm_board.py b/src/python/gem5/components/boards/arm_board.py
index 0a0cd2fa28..c2739ebff7 100644
--- a/src/python/gem5/components/boards/arm_board.py
+++ b/src/python/gem5/components/boards/arm_board.py
@@ -274,11 +274,15 @@ class ArmBoard(ArmSystem, AbstractBoard, KernelDiskWorkload):
 
     @overrides(AbstractBoard)
     def get_mem_ports(self) -> Sequence[Tuple[AddrRange, Port]]:
-        all_ports = [
-            (self.realview.bootmem.range, self.realview.bootmem.port),
-        ] + self.get_memory().get_mem_ports()
+        # Note: Ruby needs to create a directory for the realview bootmem
+        if self.get_cache_hierarchy().is_ruby():
+            all_ports = [
+                (self.realview.bootmem.range, self.realview.bootmem.port),
+            ] + self.get_memory().get_mem_ports()
 
-        return all_ports
+            return all_ports
+
+        return super().get_mem_ports()
 
     @overrides(AbstractBoard)
     def has_io_bus(self) -> bool:
diff --git a/src/python/gem5/components/cachehierarchies/classic/no_cache.py b/src/python/gem5/components/cachehierarchies/classic/no_cache.py
index e6ec89b660..c3c791f4e0 100644
--- a/src/python/gem5/components/cachehierarchies/classic/no_cache.py
+++ b/src/python/gem5/components/cachehierarchies/classic/no_cache.py
@@ -124,7 +124,7 @@ class NoCache(AbstractClassicCacheHierarchy):
         # Set up the system port for functional access from the simulator.
         board.connect_system_port(self.membus.cpu_side_ports)
 
-        for _, port in board.get_memory().get_mem_ports():
+        for _, port in board.get_mem_ports():
             self.membus.mem_side_ports = port
 
     def _setup_coherent_io_bridge(self, board: AbstractBoard) -> None:
diff --git a/src/python/gem5/components/cachehierarchies/classic/private_l1_cache_hierarchy.py b/src/python/gem5/components/cachehierarchies/classic/private_l1_cache_hierarchy.py
index 8f63d3320f..9382d11036 100644
--- a/src/python/gem5/components/cachehierarchies/classic/private_l1_cache_hierarchy.py
+++ b/src/python/gem5/components/cachehierarchies/classic/private_l1_cache_hierarchy.py
@@ -96,7 +96,7 @@ class PrivateL1CacheHierarchy(AbstractClassicCacheHierarchy):
         # Set up the system port for functional access from the simulator.
         board.connect_system_port(self.membus.cpu_side_ports)
 
-        for _, port in board.get_memory().get_mem_ports():
+        for _, port in board.get_mem_ports():
             self.membus.mem_side_ports = port
 
         self.l1icaches = [
diff --git a/src/python/gem5/components/cachehierarchies/classic/private_l1_private_l2_cache_hierarchy.py b/src/python/gem5/components/cachehierarchies/classic/private_l1_private_l2_cache_hierarchy.py
index 049d0fb102..354d9d064d 100644
--- a/src/python/gem5/components/cachehierarchies/classic/private_l1_private_l2_cache_hierarchy.py
+++ b/src/python/gem5/components/cachehierarchies/classic/private_l1_private_l2_cache_hierarchy.py
@@ -126,7 +126,7 @@ class PrivateL1PrivateL2CacheHierarchy(
         # Set up the system port for functional access from the simulator.
         board.connect_system_port(self.membus.cpu_side_ports)
 
-        for _, port in board.get_memory().get_mem_ports():
+        for _, port in board.get_mem_ports():
             self.membus.mem_side_ports = port
 
         self.l2buses = [
diff --git a/src/python/gem5/components/cachehierarchies/classic/private_l1_shared_l2_cache_hierarchy.py b/src/python/gem5/components/cachehierarchies/classic/private_l1_shared_l2_cache_hierarchy.py
index 4a896b2292..1f0d62d541 100644
--- a/src/python/gem5/components/cachehierarchies/classic/private_l1_shared_l2_cache_hierarchy.py
+++ b/src/python/gem5/components/cachehierarchies/classic/private_l1_shared_l2_cache_hierarchy.py
@@ -119,7 +119,7 @@ class PrivateL1SharedL2CacheHierarchy(
         # Set up the system port for functional access from the simulator.
         board.connect_system_port(self.membus.cpu_side_ports)
 
-        for _, port in board.get_memory().get_mem_ports():
+        for _, port in board.get_mem_ports():
             self.membus.mem_side_ports = port
 
         self.l1icaches = [

From 34437880138d44fb5de3b10a7c65079a2831a89f Mon Sep 17 00:00:00 2001
From: "Bobby R. Bruce" <bbruce@ucdavis.edu>
Date: Wed, 9 Oct 2024 14:46:54 -0700
Subject: [PATCH 23/47] misc: Add "src/python" to vscode Python Analysis Paths
 (#1647)

This allows vscode to resolve Python imports from "src/python".
Warnings regarding these imports are numerous, and the issue prevents
vscode users from utilizing features like navigating the codebase through
"Go to Definition" queries on imported classes/functions.
---
 .vscode/settings.json | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 .vscode/settings.json

diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000000..8f97d85829
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,5 @@
+{
+    "python.analysis.extraPaths": [
+        "src/python"
+    ]
+}

From feeb3b2d672557b1470e02a502a5f334a795eaab Mon Sep 17 00:00:00 2001
From: "Erin (Jianghua) Le" <ejle@ucdavis.edu>
Date: Wed, 9 Oct 2024 19:49:43 -0700
Subject: [PATCH 24/47] cpu: fix simInsts and simOps not resetting (#1615)

This PR fixes the bug where simInsts and simOps don't reset when
m5.stats.reset() is called. The stats hostInstRate and hostOpRate are
affected by this change as well, as they depend on simInsts and simOps
respectively.

This is related to issue 1443 linked
[here](https://github.com/gem5/gem5/issues/1443).
---
 src/cpu/base.cc                   |  4 ++--
 src/cpu/base.hh                   | 28 ++++++++++++++++++++++++++--
 src/cpu/o3/probe/elastic_trace.cc |  2 +-
 3 files changed, 29 insertions(+), 5 deletions(-)

diff --git a/src/cpu/base.cc b/src/cpu/base.cc
index 94d1a6e8e3..cc093e7000 100644
--- a/src/cpu/base.cc
+++ b/src/cpu/base.cc
@@ -859,13 +859,13 @@ BaseCPU::GlobalStats::GlobalStats(statistics::Group *parent)
              "Simulator op (including micro ops) rate (op/s)")
 {
     simInsts
-        .functor(BaseCPU::numSimulatedInsts)
+        .functor(BaseCPU::GlobalStats::numSimulatedInsts)
         .precision(0)
         .prereq(simInsts)
         ;
 
     simOps
-        .functor(BaseCPU::numSimulatedOps)
+        .functor(BaseCPU::GlobalStats::numSimulatedOps)
         .precision(0)
         .prereq(simOps)
         ;
diff --git a/src/cpu/base.hh b/src/cpu/base.hh
index 0be0eda344..28cd90f3e2 100644
--- a/src/cpu/base.hh
+++ b/src/cpu/base.hh
@@ -156,6 +156,30 @@ class BaseCPU : public ClockedObject
 
         statistics::Formula hostInstRate;
         statistics::Formula hostOpRate;
+
+        Counter previousInsts = 0;
+        Counter previousOps = 0;
+
+        static Counter
+        numSimulatedInsts()
+        {
+            return totalNumSimulatedInsts() - (globalStats->previousInsts);
+        }
+
+        static Counter
+        numSimulatedOps()
+        {
+            return totalNumSimulatedOps() - (globalStats->previousOps);
+        }
+
+        void
+        resetStats() override
+        {
+            previousInsts = totalNumSimulatedInsts();
+            previousOps = totalNumSimulatedOps();
+
+            statistics::Group::resetStats();
+        }
     };
 
     /**
@@ -609,7 +633,7 @@ class BaseCPU : public ClockedObject
 
     static int numSimulatedCPUs() { return cpuList.size(); }
     static Counter
-    numSimulatedInsts()
+    totalNumSimulatedInsts()
     {
         Counter total = 0;
 
@@ -621,7 +645,7 @@ class BaseCPU : public ClockedObject
     }
 
     static Counter
-    numSimulatedOps()
+    totalNumSimulatedOps()
     {
         Counter total = 0;
 
diff --git a/src/cpu/o3/probe/elastic_trace.cc b/src/cpu/o3/probe/elastic_trace.cc
index a56ef17749..2988e83038 100644
--- a/src/cpu/o3/probe/elastic_trace.cc
+++ b/src/cpu/o3/probe/elastic_trace.cc
@@ -122,7 +122,7 @@ ElasticTrace::regEtraceListeners()
 {
     assert(!allProbesReg);
     inform("@%llu: No. of instructions committed = %llu, registering elastic"
-        " probe listeners", curTick(), cpu->numSimulatedInsts());
+        " probe listeners", curTick(), cpu->totalNumSimulatedInsts());
     // Create new listeners: provide method to be called upon a notify() for
     // each probe point.
     listeners.push_back(new ProbeListenerArg<ElasticTrace, RequestPtr>(this,

From 50f652a2ee19c57f0d41541d6d8e73a59c5b62d8 Mon Sep 17 00:00:00 2001
From: Pranith <bobby.prani@gmail.com>
Date: Thu, 10 Oct 2024 09:05:22 -0700
Subject: [PATCH 25/47] Implement BTB using the cache library (#1537)

This enables the BTB to be associative and use various replacement
policies.
---
 configs/common/cores/arm/HPI.py        |   9 +
 configs/common/cores/arm/O3_ARM_v7a.py |   9 +
 configs/common/cores/arm/ex5_big.py    |   9 +
 src/cpu/pred/BranchPredictor.py        |  47 ++++
 src/cpu/pred/SConscript                |   2 +-
 src/cpu/pred/btb_entry.hh              | 288 +++++++++++++++++++++++++
 src/cpu/pred/simple_btb.cc             |  91 ++------
 src/cpu/pred/simple_btb.hh             |  72 +------
 8 files changed, 394 insertions(+), 133 deletions(-)
 create mode 100644 src/cpu/pred/btb_entry.hh

diff --git a/configs/common/cores/arm/HPI.py b/configs/common/cores/arm/HPI.py
index 826d4e19f4..36aa64eca5 100644
--- a/configs/common/cores/arm/HPI.py
+++ b/configs/common/cores/arm/HPI.py
@@ -1683,6 +1683,15 @@ class HPI_MMU(ArmMMU):
 class HPI_BTB(SimpleBTB):
     numEntries = 128
     tagBits = 18
+    associativity = 1
+    instShiftAmt = 2
+    btbReplPolicy = LRURP()
+    btbIndexingPolicy = BTBSetAssociative(
+        num_entries=Parent.numEntries,
+        set_shift=Parent.instShiftAmt,
+        assoc=Parent.associativity,
+        tag_bits=Parent.tagBits,
+    )
 
 
 class HPI_BP(TournamentBP):
diff --git a/configs/common/cores/arm/O3_ARM_v7a.py b/configs/common/cores/arm/O3_ARM_v7a.py
index 45bb391bb1..ee42c3c062 100644
--- a/configs/common/cores/arm/O3_ARM_v7a.py
+++ b/configs/common/cores/arm/O3_ARM_v7a.py
@@ -111,6 +111,15 @@ class O3_ARM_v7a_FUP(FUPool):
 class O3_ARM_v7a_BTB(SimpleBTB):
     numEntries = 2048
     tagBits = 18
+    associativity = 1
+    instShiftAmt = 2
+    btbReplPolicy = LRURP()
+    btbIndexingPolicy = BTBSetAssociative(
+        num_entries=Parent.numEntries,
+        set_shift=Parent.instShiftAmt,
+        assoc=Parent.associativity,
+        tag_bits=Parent.tagBits,
+    )
 
 
 # Bi-Mode Branch Predictor
diff --git a/configs/common/cores/arm/ex5_big.py b/configs/common/cores/arm/ex5_big.py
index f3b55fd3a8..8ea04aa5f7 100644
--- a/configs/common/cores/arm/ex5_big.py
+++ b/configs/common/cores/arm/ex5_big.py
@@ -108,6 +108,15 @@ class ex5_big_FUP(FUPool):
 class ex5_big_BTB(SimpleBTB):
     numEntries = 4096
     tagBits = 18
+    associativity = 1
+    instShiftAmt = 2
+    btbReplPolicy = LRURP()
+    btbIndexingPolicy = BTBSetAssociative(
+        num_entries=Parent.numEntries,
+        set_shift=Parent.instShiftAmt,
+        assoc=Parent.associativity,
+        tag_bits=Parent.tagBits,
+    )
 
 
 # Bi-Mode Branch Predictor
diff --git a/src/cpu/pred/BranchPredictor.py b/src/cpu/pred/BranchPredictor.py
index a10b2c2cef..5b90826315 100644
--- a/src/cpu/pred/BranchPredictor.py
+++ b/src/cpu/pred/BranchPredictor.py
@@ -38,6 +38,8 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 from m5.objects.ClockedObject import ClockedObject
+from m5.objects.IndexingPolicies import *
+from m5.objects.ReplacementPolicies import *
 from m5.params import *
 from m5.proxy import *
 from m5.SimObject import *
@@ -83,6 +85,38 @@ class BranchTargetBuffer(ClockedObject):
     numThreads = Param.Unsigned(Parent.numThreads, "Number of threads")
 
 
+class BTBIndexingPolicy(SimObject):
+    type = "BTBIndexingPolicy"
+    abstract = True
+    cxx_class = "gem5::IndexingPolicyTemplate<gem5::BTBTagType>"
+    cxx_header = "cpu/pred/btb_entry.hh"
+    cxx_template_params = ["class Types"]
+
+    # Get the associativity
+    assoc = Param.Int(Parent.assoc, "associativity")
+
+
+class BTBSetAssociative(BTBIndexingPolicy):
+    type = "BTBSetAssociative"
+    cxx_class = "gem5::BTBSetAssociative"
+    cxx_header = "cpu/pred/btb_entry.hh"
+
+    # Get the number of entries in the BTB from the parent
+    num_entries = Param.Unsigned(
+        Parent.numEntries, "Number of entries in the BTB"
+    )
+
+    # Set shift for the index. Ignore lower 2 bits for a 4 byte instruction.
+    set_shift = Param.Unsigned(2, "Number of bits to shift PC to get index")
+
+    # Total number of bits in the tag.
+    # This is above the index and offset bit
+    tag_bits = Param.Unsigned(64, "number of bits in the tag")
+
+    # Number of threads sharing the BTB
+    numThreads = Param.Unsigned(Parent.numThreads, "Number of threads")
+
+
 class SimpleBTB(BranchTargetBuffer):
     type = "SimpleBTB"
     cxx_class = "gem5::branch_prediction::SimpleBTB"
@@ -93,6 +127,19 @@ class SimpleBTB(BranchTargetBuffer):
     instShiftAmt = Param.Unsigned(
         Parent.instShiftAmt, "Number of bits to shift instructions by"
     )
+    associativity = Param.Unsigned(1, "BTB associativity")
+    btbReplPolicy = Param.BaseReplacementPolicy(
+        LRURP(), "BTB replacement policy"
+    )
+    btbIndexingPolicy = Param.BTBIndexingPolicy(
+        BTBSetAssociative(
+            assoc=Parent.associativity,
+            num_entries=Parent.numEntries,
+            set_shift=Parent.instShiftAmt,
+            numThreads=1,
+        ),
+        "BTB indexing policy",
+    )
 
 
 class IndirectPredictor(SimObject):
diff --git a/src/cpu/pred/SConscript b/src/cpu/pred/SConscript
index ec3102cada..6c03dd8a1b 100644
--- a/src/cpu/pred/SConscript
+++ b/src/cpu/pred/SConscript
@@ -45,7 +45,7 @@ SimObject('BranchPredictor.py',
     sim_objects=[
     'BranchPredictor',
     'IndirectPredictor', 'SimpleIndirectPredictor',
-    'BranchTargetBuffer', 'SimpleBTB',
+    'BranchTargetBuffer', 'SimpleBTB', 'BTBIndexingPolicy', 'BTBSetAssociative',
     'ReturnAddrStack',
     'LocalBP', 'TournamentBP', 'BiModeBP', 'TAGEBase', 'TAGE', 'LoopPredictor',
     'TAGE_SC_L_TAGE', 'TAGE_SC_L_TAGE_64KB', 'TAGE_SC_L_TAGE_8KB',
diff --git a/src/cpu/pred/btb_entry.hh b/src/cpu/pred/btb_entry.hh
new file mode 100644
index 0000000000..a445ac4775
--- /dev/null
+++ b/src/cpu/pred/btb_entry.hh
@@ -0,0 +1,288 @@
+/*
+ * Copyright (c) 2024 Pranith Kumar
+ * All rights reserved.
+ *
+ * The license below extends only to copyright in the software and shall
+ * not be construed as granting a license to any other intellectual
+ * property including but not limited to intellectual property relating
+ * to a hardware implementation of the functionality of the software
+ * licensed hereunder.  You may use the software subject to the license
+ * terms below provided that you ensure that this notice is replicated
+ * unmodified and in its entirety in all distributions of the software,
+ * modified or unmodified, in source code or in binary form.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file
+ * Declaration of a BTB entry and BTB indexing policy.
+ */
+
+#ifndef __CPU_PRED_BTB_ENTRY_HH__
+#define __CPU_PRED_BTB_ENTRY_HH__
+
+#include <vector>
+
+#include "arch/generic/pcstate.hh"
+#include "base/intmath.hh"
+#include "base/types.hh"
+#include "cpu/static_inst.hh"
+#include "mem/cache/replacement_policies/replaceable_entry.hh"
+#include "mem/cache/tags/indexing_policies/base.hh"
+#include "params/BTBIndexingPolicy.hh"
+#include "params/BTBSetAssociative.hh"
+
+namespace gem5 {
+
+class BTBTagType
+{
+  public:
+    struct KeyType
+    {
+        Addr address;
+        ThreadID tid;
+    };
+    using Params = BTBIndexingPolicyParams;
+};
+
+using BTBIndexingPolicy = IndexingPolicyTemplate<BTBTagType>;
+template class IndexingPolicyTemplate<BTBTagType>;
+
+class BTBSetAssociative : public BTBIndexingPolicy
+{
+  public:
+    PARAMS(BTBSetAssociative);
+    using KeyType = BTBTagType::KeyType;
+
+    BTBSetAssociative(const Params &p)
+        : BTBIndexingPolicy(p, p.num_entries, p.set_shift),
+          tagMask(mask(p.tag_bits))
+    {
+        setNumThreads(p.numThreads);
+    }
+
+  protected:
+    /**
+     * Extract the set index for the instruction PC based on tid.
+     */
+    uint32_t
+    extractSet(const KeyType &key) const
+    {
+        return ((key.address >> setShift)
+                ^ (key.tid << (tagShift - setShift - log2NumThreads)))
+            & setMask;
+    }
+
+  public:
+    /**
+     * Find all possible entries for insertion and replacement of an address.
+     */
+    std::vector<ReplaceableEntry*>
+    getPossibleEntries(const KeyType &key) const override
+    {
+        auto set_idx = extractSet(key);
+
+        assert(set_idx < sets.size());
+
+        return sets[set_idx];
+    }
+
+    /**
+     * Set number of threads sharing the BTB
+     */
+    void
+    setNumThreads(unsigned num_threads)
+    {
+        log2NumThreads = log2i(num_threads);
+    }
+
+    /**
+     * Generate the tag from the given address.
+     */
+    Addr
+    extractTag(const Addr addr) const override
+    {
+        return (addr >> tagShift) & tagMask;
+    }
+
+    Addr regenerateAddr(const KeyType &key,
+                        const ReplaceableEntry* entry) const override
+    {
+        panic("Not implemented!");
+        return 0;
+    }
+
+  private:
+    const uint64_t tagMask;
+    unsigned log2NumThreads;
+};
+
+namespace branch_prediction
+{
+
+class BTBEntry : public ReplaceableEntry
+{
+  public:
+    using IndexingPolicy = gem5::BTBIndexingPolicy;
+    using KeyType = gem5::BTBTagType::KeyType;
+    using TagExtractor = std::function<Addr(Addr)>;
+
+    /** Construct an invalid entry that uses ext to extract tags. */
+    BTBEntry(TagExtractor ext)
+        : inst(nullptr), extractTag(ext), valid(false), tag({MaxAddr, -1})
+    {}
+
+    /** Update the target and instruction in the BTB entry.
+     *  During insertion, only the tag (key) is updated.
+     */
+    void
+    update(const PCStateBase &_target,
+           StaticInstPtr _inst)
+    {
+        set(target, _target);
+        inst = _inst;
+    }
+
+    /**
+     * Checks if the given tag information corresponds to this entry's.
+     */
+    bool
+    match(const KeyType &key) const
+    {
+        return isValid() && (tag.address == extractTag(key.address))
+            && (tag.tid == key.tid);
+    }
+
+    /**
+     * Insert the entry by marking it valid and assigning it the tag
+     * extracted from the key's address together with the key's thread ID.
+     */
+    void
+    insert(const KeyType &key)
+    {
+        setValid();
+        setTag({extractTag(key.address), key.tid});
+    }
+
+    /** Copy constructor */
+    BTBEntry(const BTBEntry &other)
+    {
+        valid      = other.valid;
+        tag        = other.tag;
+        inst       = other.inst;
+        extractTag = other.extractTag;
+        set(target, other.target);
+    }
+
+    /** Assignment operator */
+    BTBEntry& operator=(const BTBEntry &other)
+    {
+        valid      = other.valid;
+        tag        = other.tag;
+        inst       = other.inst;
+        extractTag = other.extractTag;
+        set(target, other.target);
+
+        return *this;
+    }
+
+    /**
+     * Checks if the entry is valid.
+     */
+    bool isValid() const { return valid; }
+
+    /**
+     * Get tag associated to this block.
+     */
+    KeyType getTag() const { return tag; }
+
+    /** Invalidate the block. Its contents are no longer valid. */
+    void
+    invalidate()
+    {
+        valid = false;
+        setTag({MaxAddr, -1});
+    }
+
+    /** The entry's target. */
+    std::unique_ptr<PCStateBase> target;
+
+    /** Pointer to the static branch inst at this address */
+    StaticInstPtr inst;
+
+    std::string
+    print() const override
+    {
+        return csprintf("tag: %#x tid: %d valid: %d | %s", tag.address, tag.tid,
+                        isValid(), ReplaceableEntry::print());
+    }
+
+  protected:
+    /**
+     * Set tag associated to this block.
+     */
+    void setTag(KeyType _tag) { tag = _tag; }
+
+    /** Set valid bit. The block must be invalid beforehand. */
+    void
+    setValid()
+    {
+        assert(!isValid());
+        valid = true;
+    }
+
+  private:
+    /** Callback used to extract the tag from an address */
+    TagExtractor extractTag;
+
+    /**
+     * Valid bit. The contents of this entry are only valid if this bit is set.
+     * @sa invalidate()
+     * @sa insert()
+     */
+    bool valid;
+
+    /** The entry's tag. */
+    KeyType tag;
+};
+
+} // namespace gem5::branch_prediction
+/**
+ * This helper generates a tag extractor function object
+ * which will be typically used by Replaceable entries indexed
+ * with the BaseIndexingPolicy.
+ * It allows to "decouple" indexing from tagging. Those entries
+ * would call the functor without directly holding a pointer
+ * to the indexing policy which should reside in the cache.
+ */
+static constexpr auto
+genTagExtractor(BTBIndexingPolicy *ip)
+{
+    return [ip] (Addr addr) { return ip->extractTag(addr); };
+}
+
+}
+
+#endif //__CPU_PRED_BTB_ENTRY_HH__
diff --git a/src/cpu/pred/simple_btb.cc b/src/cpu/pred/simple_btb.cc
index c78caac7a8..0260ced8b3 100644
--- a/src/cpu/pred/simple_btb.cc
+++ b/src/cpu/pred/simple_btb.cc
@@ -44,84 +44,38 @@
 #include "base/trace.hh"
 #include "debug/BTB.hh"
 
-namespace gem5
-{
-
-namespace branch_prediction
+namespace gem5::branch_prediction
 {
 
 SimpleBTB::SimpleBTB(const SimpleBTBParams &p)
     : BranchTargetBuffer(p),
-        numEntries(p.numEntries),
-        tagBits(p.tagBits),
-        instShiftAmt(p.instShiftAmt),
-        log2NumThreads(floorLog2(p.numThreads))
+      btb("simpleBTB", p.numEntries, p.associativity,
+          p.btbReplPolicy, p.btbIndexingPolicy,
+          BTBEntry(genTagExtractor(p.btbIndexingPolicy)))
 {
     DPRINTF(BTB, "BTB: Creating BTB object.\n");
 
-    if (!isPowerOf2(numEntries)) {
+    if (!isPowerOf2(p.numEntries)) {
         fatal("BTB entries is not a power of 2!");
     }
-
-    btb.resize(numEntries);
-
-    for (unsigned i = 0; i < numEntries; ++i) {
-        btb[i].valid = false;
-    }
-
-    idxMask = numEntries - 1;
-
-    tagMask = (1 << tagBits) - 1;
-
-    tagShiftAmt = instShiftAmt + floorLog2(numEntries);
 }
 
 void
 SimpleBTB::memInvalidate()
 {
-    for (unsigned i = 0; i < numEntries; ++i) {
-        btb[i].valid = false;
-    }
+    btb.clear();
 }
 
-inline
-unsigned
-SimpleBTB::getIndex(Addr instPC, ThreadID tid)
-{
-    // Need to shift PC over by the word offset.
-    return ((instPC >> instShiftAmt)
-            ^ (tid << (tagShiftAmt - instShiftAmt - log2NumThreads)))
-            & idxMask;
-}
-
-inline
-Addr
-SimpleBTB::getTag(Addr instPC)
-{
-    return (instPC >> tagShiftAmt) & tagMask;
-}
-
-SimpleBTB::BTBEntry *
+BTBEntry *
 SimpleBTB::findEntry(Addr instPC, ThreadID tid)
 {
-    unsigned btb_idx = getIndex(instPC, tid);
-    Addr inst_tag = getTag(instPC);
-
-    assert(btb_idx < numEntries);
-
-    if (btb[btb_idx].valid
-        && inst_tag == btb[btb_idx].tag
-        && btb[btb_idx].tid == tid) {
-        return &btb[btb_idx];
-    }
-
-    return nullptr;
+    return btb.findEntry({instPC, tid});
 }
 
 bool
 SimpleBTB::valid(ThreadID tid, Addr instPC)
 {
-    BTBEntry *entry = findEntry(instPC, tid);
+    BTBEntry *entry = btb.findEntry({instPC, tid});
 
     return entry != nullptr;
 }
@@ -134,11 +88,12 @@ SimpleBTB::lookup(ThreadID tid, Addr instPC, BranchType type)
 {
     stats.lookups[type]++;
 
-    BTBEntry *entry = findEntry(instPC, tid);
+    BTBEntry *entry = btb.accessEntry({instPC, tid});
 
     if (entry) {
         return entry->target.get();
     }
+
     stats.misses[type]++;
     return nullptr;
 }
@@ -146,31 +101,27 @@ SimpleBTB::lookup(ThreadID tid, Addr instPC, BranchType type)
 const StaticInstPtr
 SimpleBTB::getInst(ThreadID tid, Addr instPC)
 {
-    BTBEntry *entry = findEntry(instPC, tid);
+    BTBEntry *entry = btb.findEntry({instPC, tid});
 
     if (entry) {
         return entry->inst;
     }
+
     return nullptr;
 }
 
 void
 SimpleBTB::update(ThreadID tid, Addr instPC,
-                    const PCStateBase &target,
-                    BranchType type, StaticInstPtr inst)
+                  const PCStateBase &target,
+                  BranchType type, StaticInstPtr inst)
 {
-    unsigned btb_idx = getIndex(instPC, tid);
-
-    assert(btb_idx < numEntries);
-
     stats.updates[type]++;
 
-    btb[btb_idx].tid = tid;
-    btb[btb_idx].valid = true;
-    set(btb[btb_idx].target, target);
-    btb[btb_idx].tag = getTag(instPC);
-    btb[btb_idx].inst = inst;
+    BTBEntry *victim = btb.findVictim({instPC, tid});
+
+    btb.insertEntry({instPC, tid}, victim);
+    victim->update(target, inst);
 }
 
-} // namespace branch_prediction
-} // namespace gem5
+
+} // namespace gem5::branch_prediction
diff --git a/src/cpu/pred/simple_btb.hh b/src/cpu/pred/simple_btb.hh
index 3c76890348..b1ef2a9fa5 100644
--- a/src/cpu/pred/simple_btb.hh
+++ b/src/cpu/pred/simple_btb.hh
@@ -41,15 +41,16 @@
 #ifndef __CPU_PRED_SIMPLE_BTB_HH__
 #define __CPU_PRED_SIMPLE_BTB_HH__
 
+#include "base/cache/associative_cache.hh"
 #include "base/logging.hh"
 #include "base/types.hh"
 #include "cpu/pred/btb.hh"
+#include "cpu/pred/btb_entry.hh"
+#include "mem/cache/replacement_policies/replaceable_entry.hh"
+#include "mem/cache/tags/indexing_policies/base.hh"
 #include "params/SimpleBTB.hh"
 
-namespace gem5
-{
-
-namespace branch_prediction
+namespace gem5::branch_prediction
 {
 
 class SimpleBTB : public BranchTargetBuffer
@@ -60,44 +61,13 @@ class SimpleBTB : public BranchTargetBuffer
     void memInvalidate() override;
     bool valid(ThreadID tid, Addr instPC) override;
     const PCStateBase *lookup(ThreadID tid, Addr instPC,
-                           BranchType type = BranchType::NoBranch) override;
+                              BranchType type = BranchType::NoBranch) override;
     void update(ThreadID tid, Addr instPC, const PCStateBase &target_pc,
-                           BranchType type = BranchType::NoBranch,
-                           StaticInstPtr inst = nullptr) override;
+                BranchType type = BranchType::NoBranch,
+                StaticInstPtr inst = nullptr) override;
     const StaticInstPtr getInst(ThreadID tid, Addr instPC) override;
 
-
   private:
-    struct BTBEntry
-    {
-        /** The entry's tag. */
-        Addr tag = 0;
-
-        /** The entry's target. */
-        std::unique_ptr<PCStateBase> target;
-
-        /** The entry's thread id. */
-        ThreadID tid;
-
-        /** Whether or not the entry is valid. */
-        bool valid = false;
-
-        /** Pointer to the static branch instruction at this address */
-        StaticInstPtr inst = nullptr;
-    };
-
-
-    /** Returns the index into the BTB, based on the branch's PC.
-     *  @param inst_PC The branch to look up.
-     *  @return Returns the index into the BTB.
-     */
-    inline unsigned getIndex(Addr instPC, ThreadID tid);
-
-    /** Returns the tag bits of a given address.
-     *  @param inst_PC The branch's address.
-     *  @return Returns the tag bits.
-     */
-    inline Addr getTag(Addr instPC);
 
     /** Internal call to find an address in the BTB
      * @param instPC The branch's address.
@@ -106,31 +76,9 @@ class SimpleBTB : public BranchTargetBuffer
     BTBEntry *findEntry(Addr instPC, ThreadID tid);
 
     /** The actual BTB. */
-    std::vector<BTBEntry> btb;
-
-    /** The number of entries in the BTB. */
-    unsigned numEntries;
-
-    /** The index mask. */
-    unsigned idxMask;
-
-    /** The number of tag bits per entry. */
-    unsigned tagBits;
-
-    /** The tag mask. */
-    unsigned tagMask;
-
-    /** Number of bits to shift PC when calculating index. */
-    unsigned instShiftAmt;
-
-    /** Number of bits to shift PC when calculating tag. */
-    unsigned tagShiftAmt;
-
-    /** Log2 NumThreads used for hashing threadid */
-    unsigned log2NumThreads;
+    AssociativeCache<BTBEntry> btb;
 };
 
-} // namespace branch_prediction
-} // namespace gem5
+} // namespace gem5::branch_prediction
 
 #endif // __CPU_PRED_SIMPLE_BTB_HH__

From 3f42ab4ca915f8db0d929b032e0851b37e8256dc Mon Sep 17 00:00:00 2001
From: Jason Lowe-Power <jason@lowepower.com>
Date: Thu, 10 Oct 2024 09:53:40 -0700
Subject: [PATCH 26/47] stdlib,ruby: Enable resetting version numbers (#1649)

Ruby requires each machine type to have a continuous set of version
numbers starting at 0. We were hiding this from users/developers by
using a Python class variable in the stdlib. Unfortunately, with
multiple ruby systems this doesn't work anymore.

As a stop-gap this change adds "resetting" these versions to the
beginning of `incorporate_cache`. It would be better to fix this in the
C++ code (and assign these numbers in C++ probably via the RubySystem),
but that's a bigger change than is needed right now.

---------

Signed-off-by: Jason Lowe-Power <jason@lowepower.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 .../chi/private_l1_cache_hierarchy.py              |  8 ++++++++
 .../ruby/abstract_ruby_cache_hierarchy.py          | 12 ++++++++++++
 .../ruby/caches/prebuilt/octopi_cache/octopi.py    | 14 ++++++++++++++
 .../ruby/mesi_three_level_cache_hierarchy.py       | 10 ++++++++++
 .../ruby/mesi_two_level_cache_hierarchy.py         |  9 +++++++++
 .../ruby/mi_example_cache_hierarchy.py             |  8 ++++++++
 6 files changed, 61 insertions(+)

diff --git a/src/python/gem5/components/cachehierarchies/chi/private_l1_cache_hierarchy.py b/src/python/gem5/components/cachehierarchies/chi/private_l1_cache_hierarchy.py
index a469fead61..42c4e2258c 100644
--- a/src/python/gem5/components/cachehierarchies/chi/private_l1_cache_hierarchy.py
+++ b/src/python/gem5/components/cachehierarchies/chi/private_l1_cache_hierarchy.py
@@ -82,6 +82,7 @@ class PrivateL1CacheHierarchy(AbstractRubyCacheHierarchy):
 
     @overrides(AbstractCacheHierarchy)
     def incorporate_cache(self, board: AbstractBoard) -> None:
+        super().incorporate_cache(board)
         self.ruby_system = RubySystem()
 
         # Ruby's global network.
@@ -244,3 +245,10 @@ class PrivateL1CacheHierarchy(AbstractRubyCacheHierarchy):
             dma_controllers.append(ctrl)
 
         return dma_controllers
+
+    @overrides(AbstractRubyCacheHierarchy)
+    def _reset_version_numbers(self):
+        from .nodes.abstract_node import AbstractNode
+
+        AbstractNode._version = 0
+        MemoryController._version = 0
diff --git a/src/python/gem5/components/cachehierarchies/ruby/abstract_ruby_cache_hierarchy.py b/src/python/gem5/components/cachehierarchies/ruby/abstract_ruby_cache_hierarchy.py
index 3528b74495..6e7e957934 100644
--- a/src/python/gem5/components/cachehierarchies/ruby/abstract_ruby_cache_hierarchy.py
+++ b/src/python/gem5/components/cachehierarchies/ruby/abstract_ruby_cache_hierarchy.py
@@ -37,6 +37,18 @@ class AbstractRubyCacheHierarchy(AbstractCacheHierarchy):
     def __init__(self):
         super().__init__()
 
+    def _reset_version_numbers(self):
+        """Needed for multiple ruby systems so that each system starts at 0.
+
+        Note: This needs to be overridden by the protocol since we don't know
+        the machine classes at this point.
+        """
+        raise NotImplementedError
+
+    @overrides(AbstractCacheHierarchy)
+    def incorporate_cache(self, board):
+        self._reset_version_numbers()
+
     @overrides(AbstractCacheHierarchy)
     def is_ruby(self) -> bool:
         return True
diff --git a/src/python/gem5/components/cachehierarchies/ruby/caches/prebuilt/octopi_cache/octopi.py b/src/python/gem5/components/cachehierarchies/ruby/caches/prebuilt/octopi_cache/octopi.py
index 83137ce15a..d576ae6ae4 100644
--- a/src/python/gem5/components/cachehierarchies/ruby/caches/prebuilt/octopi_cache/octopi.py
+++ b/src/python/gem5/components/cachehierarchies/ruby/caches/prebuilt/octopi_cache/octopi.py
@@ -38,6 +38,7 @@ from ......components.cachehierarchies.ruby.caches.mesi_three_level.directory im
 from ......components.cachehierarchies.ruby.caches.mesi_three_level.dma_controller import (
     DMAController,
 )
+from ......utils.override import overrides
 from ......utils.requires import requires
 from ....abstract_three_level_cache_hierarchy import (
     AbstractThreeLevelCacheHierarchy,
@@ -95,6 +96,7 @@ class OctopiCache(
         requires(
             coherence_protocol_required=CoherenceProtocol.MESI_THREE_LEVEL
         )
+        super().incorporate_cache(board)
 
         cache_line_size = board.get_cache_line_size()
 
@@ -267,3 +269,15 @@ class OctopiCache(
             ]
             for link in self.dma_int_links:
                 self.ruby_system.network._add_int_link(link)
+
+    @overrides(AbstractRubyCacheHierarchy)
+    def _reset_version_numbers(self):
+        from ....caches.mesi_three_level.l1_cache import L1Cache
+        from ....caches.mesi_three_level.l2_cache import L2Cache
+        from ....caches.mesi_three_level.l3_cache import L3Cache
+
+        Directory._version = 0
+        L1Cache._version = 0
+        L2Cache._version = 0
+        L3Cache._version = 0
+        DMAController._version = 0
diff --git a/src/python/gem5/components/cachehierarchies/ruby/mesi_three_level_cache_hierarchy.py b/src/python/gem5/components/cachehierarchies/ruby/mesi_three_level_cache_hierarchy.py
index 92e8860a24..501fbab081 100644
--- a/src/python/gem5/components/cachehierarchies/ruby/mesi_three_level_cache_hierarchy.py
+++ b/src/python/gem5/components/cachehierarchies/ruby/mesi_three_level_cache_hierarchy.py
@@ -33,6 +33,7 @@ from m5.objects import (
 )
 
 from ....coherence_protocol import CoherenceProtocol
+from ....utils.override import overrides
 from ....utils.requires import requires
 
 requires(coherence_protocol_required=CoherenceProtocol.MESI_THREE_LEVEL)
@@ -87,6 +88,7 @@ class MESIThreeLevelCacheHierarchy(
         self._num_l3_banks = num_l3_banks
 
     def incorporate_cache(self, board: AbstractBoard) -> None:
+        super().incorporate_cache(board)
         cache_line_size = board.get_cache_line_size()
 
         self.ruby_system = RubySystem()
@@ -233,3 +235,11 @@ class MESIThreeLevelCacheHierarchy(
             ruby_system=self.ruby_system
         )
         board.connect_system_port(self.ruby_system.sys_port_proxy.in_ports)
+
+    @overrides(AbstractRubyCacheHierarchy)
+    def _reset_version_numbers(self):
+        Directory._version = 0
+        L1Cache._version = 0
+        L2Cache._version = 0
+        L3Cache._version = 0
+        DMAController._version = 0
diff --git a/src/python/gem5/components/cachehierarchies/ruby/mesi_two_level_cache_hierarchy.py b/src/python/gem5/components/cachehierarchies/ruby/mesi_two_level_cache_hierarchy.py
index efe714c23c..52a14c7681 100644
--- a/src/python/gem5/components/cachehierarchies/ruby/mesi_two_level_cache_hierarchy.py
+++ b/src/python/gem5/components/cachehierarchies/ruby/mesi_two_level_cache_hierarchy.py
@@ -33,6 +33,7 @@ from m5.objects import (
 )
 
 from ....coherence_protocol import CoherenceProtocol
+from ....utils.override import overrides
 from ....utils.requires import requires
 
 requires(coherence_protocol_required=CoherenceProtocol.MESI_TWO_LEVEL)
@@ -83,6 +84,7 @@ class MESITwoLevelCacheHierarchy(
         self._num_l2_banks = num_l2_banks
 
     def incorporate_cache(self, board: AbstractBoard) -> None:
+        super().incorporate_cache(board)
         cache_line_size = board.get_cache_line_size()
 
         self.ruby_system = RubySystem()
@@ -199,3 +201,10 @@ class MESITwoLevelCacheHierarchy(
             ruby_system=self.ruby_system
         )
         board.connect_system_port(self.ruby_system.sys_port_proxy.in_ports)
+
+    @overrides(AbstractRubyCacheHierarchy)
+    def _reset_version_numbers(self):
+        Directory._version = 0
+        L1Cache._version = 0
+        L2Cache._version = 0
+        DMAController._version = 0
diff --git a/src/python/gem5/components/cachehierarchies/ruby/mi_example_cache_hierarchy.py b/src/python/gem5/components/cachehierarchies/ruby/mi_example_cache_hierarchy.py
index 56e620ff0c..271bc42536 100644
--- a/src/python/gem5/components/cachehierarchies/ruby/mi_example_cache_hierarchy.py
+++ b/src/python/gem5/components/cachehierarchies/ruby/mi_example_cache_hierarchy.py
@@ -32,6 +32,7 @@ from m5.objects import (
 )
 
 from ....coherence_protocol import CoherenceProtocol
+from ....utils.override import overrides
 from ....utils.requires import requires
 
 requires(coherence_protocol_required=CoherenceProtocol.MI_EXAMPLE)
@@ -65,6 +66,7 @@ class MIExampleCacheHierarchy(AbstractRubyCacheHierarchy):
 
     @overrides(AbstractCacheHierarchy)
     def incorporate_cache(self, board: AbstractBoard) -> None:
+        super().incorporate_cache(board)
         self.ruby_system = RubySystem()
 
         # Ruby's global network.
@@ -176,3 +178,9 @@ class MIExampleCacheHierarchy(AbstractRubyCacheHierarchy):
             ruby_system=self.ruby_system
         )
         board.connect_system_port(self.ruby_system.sys_port_proxy.in_ports)
+
+    @overrides(AbstractRubyCacheHierarchy)
+    def _reset_version_numbers(self):
+        Directory._version = 0
+        L1Cache._version = 0
+        DMAController._version = 0

From c1c5147e530ce05e0f1c8f60dbb769c809f5194b Mon Sep 17 00:00:00 2001
From: "Bobby R. Bruce" <bbruce@ucdavis.edu>
Date: Thu, 10 Oct 2024 10:13:56 -0700
Subject: [PATCH 27/47] tests,misc: Remove `edited` from PR Action trigger list
 (#1654)

`edited` is what forces a re-run of our tests when the PR title is
updated and other minor metadata stuff. I believe all changes to the
code are covered by the remainder. `synchronize` means the tests are
triggered when the branch this PR is from (in this case my forked gem5
repo) is synced with the PR branch here. This covers the vast majority
of cases we care about. `opened` covers the case where the PR is
created and `ready_for_review` the case where it moves out of a draft.
---
 .github/workflows/ci-tests.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci-tests.yaml b/.github/workflows/ci-tests.yaml
index 4a4ecb5ba4..f40f3faff9 100644
--- a/.github/workflows/ci-tests.yaml
+++ b/.github/workflows/ci-tests.yaml
@@ -5,7 +5,7 @@ name: CI Tests
 
 on:
     pull_request:
-        types: [opened, edited, synchronize, ready_for_review]
+        types: [opened, synchronize, ready_for_review]
 
 concurrency:
     group: ${{ github.workflow }}-${{ github.ref || github.run_id }}

From 6195b33960c4821cb21d671b32fa10bb7944557a Mon Sep 17 00:00:00 2001
From: "Erin (Jianghua) Le" <ejle@ucdavis.edu>
Date: Thu, 10 Oct 2024 10:17:03 -0700
Subject: [PATCH 28/47] util-docker,tests: Add compiler tests & Dockerfiles for
 GCC 14 (#1646)

This commit adds gcc 14 to the compiler tests and Dockerfiles.
---
 .github/workflows/compiler-tests.yaml    |  6 +++---
 util/dockerfiles/docker-bake.hcl         | 13 ++++++++++++-
 util/dockerfiles/gcc-compiler/Dockerfile |  2 +-
 3 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/compiler-tests.yaml b/.github/workflows/compiler-tests.yaml
index eb570916bc..c44d2d9161 100644
--- a/.github/workflows/compiler-tests.yaml
+++ b/.github/workflows/compiler-tests.yaml
@@ -13,8 +13,8 @@ jobs:
         strategy:
             fail-fast: false
             matrix:
-                image: [gcc-version-13, gcc-version-12, gcc-version-11, gcc-version-10, clang-version-18, clang-version-17, clang-version-16, clang-version-15,
-                    clang-version-14, ubuntu-22.04_all-dependencies, ubuntu-24.04_all-dependencies, ubuntu-24.04_min-dependencies]
+                image: [gcc-version-14, gcc-version-13, gcc-version-12, gcc-version-11, gcc-version-10, clang-version-18, clang-version-17, clang-version-16,
+                    clang-version-15, clang-version-14, ubuntu-22.04_all-dependencies, ubuntu-24.04_all-dependencies, ubuntu-24.04_min-dependencies]
                 opts: [.opt, .fast]
         runs-on: [self-hosted, linux, x64]
         timeout-minutes: 2880 # 48 hours
@@ -32,7 +32,7 @@ jobs:
             matrix:
                 gem5-compilation: [ARM, ARM_MESI_Three_Level, ARM_MESI_Three_Level_HTM, ARM_MOESI_hammer, Garnet_standalone, MIPS, 'NULL', NULL_MESI_Two_Level,
                     NULL_MOESI_CMP_directory, NULL_MOESI_CMP_token, NULL_MOESI_hammer, POWER, RISCV, SPARC, X86, X86_MI_example, X86_MOESI_AMD_Base, VEGA_X86]
-                image: [gcc-version-13, clang-version-18]
+                image: [gcc-version-14, clang-version-18]
                 opts: [.opt]
         runs-on: [self-hosted, linux, x64]
         timeout-minutes: 2880 # 48 hours
diff --git a/util/dockerfiles/docker-bake.hcl b/util/dockerfiles/docker-bake.hcl
index 05f3b4c94b..3517894684 100644
--- a/util/dockerfiles/docker-bake.hcl
+++ b/util/dockerfiles/docker-bake.hcl
@@ -125,7 +125,8 @@ group "gcc-compilers" {
     "gcc-version-10",
     "gcc-version-11",
     "gcc-version-12",
-    "gcc-version-13"
+    "gcc-version-13",
+    "gcc-version-14"
   ]
 }
 
@@ -169,6 +170,16 @@ target "gcc-version-13" {
   tags = ["${IMAGE_URI}/gcc-version-13:${TAG}"]
 }
 
+target "gcc-version-14" {
+  inherits = ["common"]
+  annotations = ["index,manifest:org.opencontainers.image.description=An image with all dependencies for building gem5 with a GCC v14 compiler."]
+  args = {
+    version = "14"
+  }
+  context = "gcc-compiler"
+  tags = ["${IMAGE_URI}/gcc-version-14:${TAG}"]
+}
+
 group "ubuntu-releases" {
   targets=[
     "ubuntu-24-04_all-dependencies",
diff --git a/util/dockerfiles/gcc-compiler/Dockerfile b/util/dockerfiles/gcc-compiler/Dockerfile
index f36130ebff..8fd5032113 100644
--- a/util/dockerfiles/gcc-compiler/Dockerfile
+++ b/util/dockerfiles/gcc-compiler/Dockerfile
@@ -3,7 +3,7 @@ FROM  ghcr.io/gem5/ubuntu-24.04_all-dependencies:latest
 LABEL org.opencontainers.image.source=https://github.com/gem5/gem5
 LABEL org.opencontainers.image.licenses=BSD-3-Clause
 
-ARG version=13 # Version of GCC to install in this image. Default is 13.
+ARG version=14 # Version of GCC to install in this image. Default is 14.
 
 RUN apt -y update && \
     apt -y install gcc-${version} g++-${version} && \

From 65ba2dcae51c97010439d07b5d0618b13b83cc2a Mon Sep 17 00:00:00 2001
From: "Bobby R. Bruce" <bbruce@ucdavis.edu>
Date: Thu, 10 Oct 2024 10:17:32 -0700
Subject: [PATCH 29/47] tests: Refactor downloading of pannotia tests (#1653)

With this patch the pannotia tests now:

1. Download the resources to 'gpu-pannotia' in the
'tests/gem5/resources' directory. This is where other test resources are
stored.
2. Download the USA-road-d.NY.gr dataset from a Google Cloud bucket in a
decompressed state.
3. Avoid re-downloading the resources if they are already present on the
host machine.
---
 tests/gem5/gpu/test_gpu_pannotia.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/tests/gem5/gpu/test_gpu_pannotia.py b/tests/gem5/gpu/test_gpu_pannotia.py
index 6dc3fe5d1d..35275cf61d 100644
--- a/tests/gem5/gpu/test_gpu_pannotia.py
+++ b/tests/gem5/gpu/test_gpu_pannotia.py
@@ -27,11 +27,14 @@
 import gzip
 import os.path
 import shutil
+from pathlib import Path
 from urllib.request import urlretrieve
 
 from testlib import *
 
-resource_path = joinpath(absdirpath(__file__), "..", "gpu-pannotia-resources")
+resource_path = joinpath(
+    absdirpath(__file__), "..", "resources", "gpu-pannotia"
+)
 binary_path = joinpath(resource_path, "pannotia-bins")
 dataset_path = joinpath(resource_path, "pannotia-datasets")
 
@@ -57,7 +60,7 @@ dataset_links = {
     "G3_circuit.graph": "https://storage.googleapis.com/dist.gem5.org/dist/develop/datasets/pannotia/mis/G3_circuit.graph",
     "ecology1.graph": "https://storage.googleapis.com/dist.gem5.org/dist/develop/datasets/pannotia/mis/ecology1.graph",
     "coAuthorsDBLP.graph": "https://storage.googleapis.com/dist.gem5.org/dist/develop/datasets/pannotia/pagerank/coAuthorsDBLP.graph",
-    "USA-road-d.NY.gr.gz": "http://www.diag.uniroma1.it/challenge9/data/USA-road-d/USA-road-d.NY.gr.gz",
+    "USA-road-d.NY.gr": "https://storage.googleapis.com/dist.gem5.org/dist/develop/datasets/pannotia/USA-road-d/USA-road-d.NY.gr",
 }
 
 
@@ -66,15 +69,14 @@ if not os.path.isdir(resource_path):
     os.makedirs(dataset_path)
 
     for name in binary_links.keys():
+        if Path(f"{binary_path}/{name}").exists():
+            continue
         urlretrieve(binary_links[name], f"{binary_path}/{name}")
     for name in dataset_links.keys():
+        if Path(f"{dataset_path}/{name}").exists():
+            continue
         urlretrieve(dataset_links[name], f"{dataset_path}/{name}")
 
-    with gzip.open(f"{dataset_path}/USA-road-d.NY.gr.gz", "rb") as f_in:
-        with open(f"{dataset_path}/USA-road-d.NY.gr", "wb") as f_out:
-            shutil.copyfileobj(f_in, f_out)
-    os.remove(f"{dataset_path}/USA-road-d.NY.gr.gz")
-
 if len(os.listdir(binary_path)) < len(binary_links):
     testlib.log.test_log.warn(
         "One or more binaries for the Pannotia GPU tests are missing! Try deleting gpu-pannotia-resources and rerunning."

From a8f88abfb12f772f496aad2b9866476f7ebb575c Mon Sep 17 00:00:00 2001
From: "Bobby R. Bruce" <bbruce@ucdavis.edu>
Date: Thu, 10 Oct 2024 10:18:14 -0700
Subject: [PATCH 30/47] misc: Add 'ext' & 'tests' to vscode python extraPaths
 (#1652)

'ext' is set as a Python source path for gem5, like 'src/python'. It
helps vscode users to have vscode aware of this to improve analysis and
reduce warnings (most commonly "unable to resolve import").

'tests' isn't in the Python source path when compiling gem5 but it is
when running `tests/main.py`. Though somewhat unideal as it lets vscode
think files in 'src' can import from files in 'tests', adding this helps
vscode Python analysis parse the test files which reduces warnings and
aids in better navigation of the testing code. This is particularly
helpful given the complexity of the testlib testing infrastructure.
---
 .vscode/settings.json | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.vscode/settings.json b/.vscode/settings.json
index 8f97d85829..9543f965b7 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -1,5 +1,7 @@
 {
     "python.analysis.extraPaths": [
-        "src/python"
+        "src/python",
+        "ext",
+        "tests"
     ]
 }

From a35f146ba28f5d7a53df95d09b5195c30c90b43f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sa=C3=BAl=20Adserias?=
 <33020671+saul44203@users.noreply.github.com>
Date: Tue, 2 Apr 2024 21:09:12 +0200
Subject: [PATCH 31/47] configs: add example RVV SE parametrized config

Change-Id: I0776c5751da8b80340166ab518593686d141a4dd
---
 .../example/gem5_library/riscv-rvv-example.py | 120 ++++++++++++++++++
 1 file changed, 120 insertions(+)
 create mode 100755 configs/example/gem5_library/riscv-rvv-example.py

diff --git a/configs/example/gem5_library/riscv-rvv-example.py b/configs/example/gem5_library/riscv-rvv-example.py
new file mode 100755
index 0000000000..57a6fd7afd
--- /dev/null
+++ b/configs/example/gem5_library/riscv-rvv-example.py
@@ -0,0 +1,120 @@
+# Copyright (c) 2024 Barcelona Supercomputing Center
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software without
+# specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+"""
+This script demonstrates how to run RISC-V vector-enabled binaries in SE mode
+with gem5. It accepts the number of CORES, VLEN, and ELEN as optional
+parameters, as well as the resource name to run. If no resource name is
+provided, a list of available resources will be displayed. If one is given the
+simulation will then execute the specified resource binary with the selected
+parameters until completion.
+
+
+Usage
+-----
+
+# Compile gem5 for RISC-V
+scons build/RISCV/gem5.opt
+
+# Run the simulation
+./build/RISCV/gem5.opt configs/example/gem5_library/riscv-rvv-example.py \
+    [-c CORES] [-v VLEN] [-e ELEN] <resource>
+
+"""
+
+import argparse
+
+from m5.objects import RiscvO3CPU
+
+from gem5.components.boards.simple_board import SimpleBoard
+from gem5.components.cachehierarchies.classic.private_l1_private_l2_cache_hierarchy import (
+    PrivateL1PrivateL2CacheHierarchy,
+)
+from gem5.components.memory import SingleChannelDDR3_1600
+from gem5.components.processors.base_cpu_core import BaseCPUCore
+from gem5.components.processors.base_cpu_processor import BaseCPUProcessor
+from gem5.isas import ISA
+from gem5.resources.resource import obtain_resource
+from gem5.simulate.simulator import Simulator
+from gem5.utils.requires import requires
+
+
+class RVVCore(BaseCPUCore):
+    def __init__(self, elen, vlen, cpu_id):
+        super().__init__(core=RiscvO3CPU(cpu_id=cpu_id), isa=ISA.RISCV)
+        self.core.isa[0].elen = elen
+        self.core.isa[0].vlen = vlen
+
+
+requires(isa_required=ISA.RISCV)
+
+resources = [
+    "rvv-branch",
+    "rvv-index",
+    "rvv-matmul",
+    "rvv-memcpy",
+    "rvv-reduce",
+    "rvv-saxpy",
+    "rvv-sgemm",
+    "rvv-strcmp",
+    "rvv-strcpy",
+    "rvv-strlen",
+    "rvv-strlen-fault",
+    "rvv-strncpy",
+]
+
+parser = argparse.ArgumentParser()
+parser.add_argument("resource", type=str, choices=resources)
+parser.add_argument("-c", "--cores", required=False, type=int, default=1)
+parser.add_argument("-v", "--vlen", required=False, type=int, default=256)
+parser.add_argument("-e", "--elen", required=False, type=int, default=64)
+
+args = parser.parse_args()
+
+cache_hierarchy = PrivateL1PrivateL2CacheHierarchy(
+    l1d_size="32KiB", l1i_size="32KiB", l2_size="512KiB"
+)
+
+memory = SingleChannelDDR3_1600()
+
+processor = BaseCPUProcessor(
+    cores=[RVVCore(args.elen, args.vlen, i) for i in range(args.cores)]
+)
+
+board = SimpleBoard(
+    clk_freq="1GHz",
+    processor=processor,
+    memory=memory,
+    cache_hierarchy=cache_hierarchy,
+)
+
+binary = obtain_resource(args.resource)
+board.set_se_binary_workload(binary)
+
+simulator = Simulator(board=board, full_system=False)
+print("Beginning simulation!")
+simulator.run()

From f4ffe5f815eee8562f6a1c3e59466fc260285805 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sa=C3=BAl=20Adserias?=
 <33020671+saul44203@users.noreply.github.com>
Date: Wed, 3 Apr 2024 13:43:03 +0200
Subject: [PATCH 32/47] tests: add rvv-intrinsic-tests script and config

Change-Id: Ia3fa67bb2a2603dd5cbf665504f85a8b969c2a5e
---
 .../gem5/se_mode/rvv_intrinsic_tests/test.py  | 63 +++++++++++++++++++
 1 file changed, 63 insertions(+)
 create mode 100644 tests/gem5/se_mode/rvv_intrinsic_tests/test.py

diff --git a/tests/gem5/se_mode/rvv_intrinsic_tests/test.py b/tests/gem5/se_mode/rvv_intrinsic_tests/test.py
new file mode 100644
index 0000000000..e20018ba60
--- /dev/null
+++ b/tests/gem5/se_mode/rvv_intrinsic_tests/test.py
@@ -0,0 +1,63 @@
+# Copyright (c) 2024 Barcelona Supercomputing Center
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software without
+# specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import os
+import re
+import sys
+
+from testlib import *
+
+resources = [
+    "rvv-branch",
+    "rvv-index",
+    "rvv-matmul",
+    "rvv-memcpy",
+    "rvv-reduce",
+    "rvv-saxpy",
+    "rvv-sgemm",
+    "rvv-strcmp",
+    "rvv-strcpy",
+    "rvv-strlen",
+    "rvv-strlen-fault",
+    "rvv-strncpy",
+]
+
+vlens = [2**x for x in range(7, 15)]
+
+for resource in resources:
+    out_verifier = verifier.MatchRegex(re.compile(f"^.*{resource}: pass$"))
+
+    for vlen in vlens:
+        gem5_verify_config(
+            name=f"test-riscv-{resource}-vlen_{vlen}-O3-se-mode",
+            fixtures=(),
+            verifiers=(out_verifier,),
+            config=f"{config.base_dir}/configs/example/gem5_library/riscv-rvv-example.py",
+            config_args=[resource, f"--vlen={vlen}"],
+            valid_isas=(constants.all_compiled_tag,),
+            length=constants.quick_tag,
+        )

From 1edeeda88156f552a7de482f3e1a04fcb9866332 Mon Sep 17 00:00:00 2001
From: Matthew Poremba <matthew.poremba@amd.com>
Date: Mon, 14 Oct 2024 08:51:05 -0700
Subject: [PATCH 33/47] dev: Make unknown PCI device writes a warning (#1657)

This pops up in kernel 6.8.0. The device being written to is currently
unknown, but ignoring the write does not cause problems. Therefore,
change the panic to a warning and respond to the request with the
default PCI latency.

Change-Id: I4c1229753a75a94a255d8cfd411ac7311283366b
---
 src/dev/pci/host.cc | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/src/dev/pci/host.cc b/src/dev/pci/host.cc
index e7dea6c359..80cd9b5a5d 100644
--- a/src/dev/pci/host.cc
+++ b/src/dev/pci/host.cc
@@ -168,9 +168,14 @@ GenericPciHost::write(PacketPtr pkt)
             pkt->getSize());
 
     PciDevice *const pci_dev(getDevice(dev_addr.first));
-    panic_if(!pci_dev,
-             "%02x:%02x.%i: Write to config space on non-existent PCI device\n",
-             dev_addr.first.bus, dev_addr.first.dev, dev_addr.first.func);
+    warn_if(!pci_dev,
+            "%02x:%02x.%i: Write to config space on non-existent PCI device\n",
+            dev_addr.first.bus, dev_addr.first.dev, dev_addr.first.func);
+
+    if (!pci_dev) {
+        pkt->makeAtomicResponse();
+        return 20000; // 20ns default from PciDevice.py
+    }
 
     // @todo Remove this after testing
     pkt->headerDelay = pkt->payloadDelay = 0;

From 652a72d122ff2c9404c9122e98d035ed6efb7d36 Mon Sep 17 00:00:00 2001
From: Leon <541959102@qq.com>
Date: Tue, 15 Oct 2024 01:00:48 +0800
Subject: [PATCH 34/47] arch-riscv: Add support for riscv hardware probing
 syscall (#1525)

This PR adds the support for riscv hardware probing syscall described in
[this](https://docs.kernel.org/arch/riscv/hwprobe.html). The
implementation logic refers to [linux
kernel](https://github.com/torvalds/linux/blob/master/arch/riscv/kernel/sys_hwprobe.c)
and
[qemu](https://github.com/qemu/qemu/blob/master/linux-user/syscall.c).
And passed the [RISC-V hwprobe
example](https://github.com/cyyself/hwprobe) test.

Hope to be merged. Thanks.

Change-Id: Iab714974f0551fc451e0d6846c75a7153809a308

Co-authored-by: Zhibo Hong <hongzhibo@bytedance.com>
---
 src/arch/riscv/linux/linux.hh       |  96 +++++++
 src/arch/riscv/linux/se_workload.cc | 386 ++++++++++++++++++++++++++++
 2 files changed, 482 insertions(+)

diff --git a/src/arch/riscv/linux/linux.hh b/src/arch/riscv/linux/linux.hh
index b2fbdd29f3..17281340d7 100644
--- a/src/arch/riscv/linux/linux.hh
+++ b/src/arch/riscv/linux/linux.hh
@@ -34,6 +34,7 @@
 #include "arch/riscv/utility.hh"
 #include "kern/linux/flag_tables.hh"
 #include "kern/linux/linux.hh"
+#include "base/bitfield.hh"
 
 namespace gem5
 {
@@ -42,6 +43,101 @@ class RiscvLinux : public Linux
 {
   public:
     static const ByteOrder byteOrder = ByteOrder::little;
+
+    enum RiscvHwprobeKey
+    {
+        Mvendorid,
+        Marchid,
+        Mimpid,
+        BaseBehavior,
+        IMAExt0,
+        Cpuperf0,
+        ZicbozBlockSize,
+        HighestVirtAddress,
+        TimeCsrFreq,
+        MisalignedScalarPerf
+    };
+
+    /* Increase RISCV_HWPROBE_MAX_KEY when adding items. */
+    #define RISCV_HWPROBE_MAX_KEY 9
+
+    BitUnion64(key_base_behavior_t)
+    Bitfield<0> ima;
+    EndBitUnion(key_base_behavior_t)
+
+    BitUnion64(key_ima_ext_0_t) // Linux RISCV_HWPROBE_KEY_IMA_EXT_0 bits
+        Bitfield<48> ZAWRS;
+        Bitfield<47> ZCMOP;
+        Bitfield<46> ZCF;
+        Bitfield<45> ZCD;
+        Bitfield<44> ZCB;
+        Bitfield<43> ZCA;
+        Bitfield<42> ZIMOP;
+        Bitfield<41> ZVE64D;
+        Bitfield<40> ZVE64F;
+        Bitfield<39> ZVE64X;
+        Bitfield<38> ZVE32F;
+        Bitfield<37> ZVE32X;
+        Bitfield<36> ZIHINTPAUSE;
+        Bitfield<35> ZICOND;
+        Bitfield<34> ZACAS;
+        Bitfield<33> ZTSO;
+        Bitfield<32> ZFA;
+        Bitfield<31> ZVFHMIN;
+        Bitfield<30> ZVFH;
+        Bitfield<29> ZIHINTNTL;
+        Bitfield<28> ZFHMIN;
+        Bitfield<27> ZFH;
+        Bitfield<26> ZVKT;
+        Bitfield<25> ZVKSH;
+        Bitfield<24> ZVKSED;
+        Bitfield<23> ZVKNHB;
+        Bitfield<22> ZVKNHA;
+        Bitfield<21> ZVKNED;
+        Bitfield<20> ZVKG;
+        Bitfield<19> ZVKB;
+        Bitfield<18> ZVBC;
+        Bitfield<17> ZVBB;
+        Bitfield<16> ZKT;
+        Bitfield<15> ZKSH;
+        Bitfield<14> ZKSED;
+        Bitfield<13> ZKNH;
+        Bitfield<12> ZKNE;
+        Bitfield<11> ZKND;
+        Bitfield<10> ZBKX;
+        Bitfield<9>  ZBKC;
+        Bitfield<8>  ZBKB;
+        Bitfield<7>  ZBC;
+        Bitfield<6>  ZICBOZ;
+        Bitfield<5>  ZBS;
+        Bitfield<4>  ZBB;
+        Bitfield<3>  ZBA;
+        Bitfield<2>  V;
+        Bitfield<1>  C;
+        Bitfield<0>  FD;
+    EndBitUnion(key_ima_ext_0_t)
+
+    enum MisalignedScalarPerf
+    {
+        Unknown,
+        Emulated,
+        Slow,
+        Fast,
+        Unsupported
+    };
+
+    /* Flags */
+    #define RISCV_HWPROBE_WHICH_CPUS	(1 << 0)
+
+    struct riscv_hwprobe {
+        int64_t  key;
+        uint64_t value;
+    };
+
+    typedef struct cpumask {
+        size_t size;
+        uint64_t bits[];
+    } cpumask_t;
 };
 
 class RiscvLinux64 : public RiscvLinux, public OpenFlagTable<RiscvLinux64>
diff --git a/src/arch/riscv/linux/se_workload.cc b/src/arch/riscv/linux/se_workload.cc
index 6caec283ed..d3015202b7 100644
--- a/src/arch/riscv/linux/se_workload.cc
+++ b/src/arch/riscv/linux/se_workload.cc
@@ -44,6 +44,8 @@
 #include <sys/syscall.h>
 
 #include "arch/riscv/process.hh"
+#include "arch/riscv/insts/static_inst.hh"
+#include "arch/riscv/regs/misc.hh"
 #include "base/loader/object_file.hh"
 #include "base/trace.hh"
 #include "cpu/thread_context.hh"
@@ -134,6 +136,388 @@ unameFunc32(SyscallDesc *desc, ThreadContext *tc, VPtr<Linux::utsname> name)
     return 0;
 }
 
+// Set the bit for @cpu in @dstp; each uint64_t word of bits[] holds 64 CPUs.
+static inline void
+cpumask_set_cpu(unsigned int cpu, RiscvLinux::cpumask_t *dstp)
+{
+    assert(cpu < dstp->size * 8);
+    dstp->bits[cpu / 64] |= 1ULL << (cpu % 64);
+}
+
+// Clear @cpu's bit in @dstp; each uint64_t word of bits[] holds 64 CPUs.
+static inline void
+cpumask_clear_cpu(unsigned int cpu, RiscvLinux::cpumask_t *dstp)
+{
+    assert(cpu < dstp->size * 8);
+    dstp->bits[cpu / 64] &= ~(1ULL << (cpu % 64));
+}
+
+static inline bool
+cpumask_test_cpu(unsigned int cpu, const RiscvLinux::cpumask_t *cpumask)
+{
+    assert(cpu < cpumask->size * 8);
+    return (cpumask->bits[cpu / 64] >> (cpu % 64)) & 1; // 64 CPUs per word
+}
+
+static inline void
+cpumask_and(RiscvLinux::cpumask_t *dstp, const RiscvLinux::cpumask_t *src1p,
+            const RiscvLinux::cpumask_t *src2p)
+{
+    assert(dstp->size == src1p->size);
+    assert(dstp->size == src2p->size);
+    for (size_t i = 0; i < dstp->size / sizeof(dstp->bits[0]); i++) {
+        dstp->bits[i] = src1p->bits[i] & src2p->bits[i];
+    }
+}
+
+static inline bool
+cpumask_empty(const RiscvLinux::cpumask_t *dstp)
+{
+    for (size_t i = 0; i < dstp->size / sizeof(dstp->bits[0]); i++) {
+        if (dstp->bits[i] != 0) {
+            return false;
+        }
+    }
+    return true;
+}
+
+static inline void
+cpumask_copy(RiscvLinux::cpumask_t *dstp, const RiscvLinux::cpumask_t *srcp)
+{
+    assert(dstp->size == srcp->size);
+    memcpy(dstp->bits, srcp->bits, srcp->size);
+}
+
+static inline void
+cpumask_clear(RiscvLinux::cpumask_t *dstp)
+{
+    memset(dstp->bits, 0, dstp->size);
+}
+
+static inline RiscvLinux::cpumask_t *
+cpumask_malloc(ThreadContext *tc)
+{
+    RiscvLinux::cpumask_t *cpumask;
+
+    /* 8-bytes up-boundary alignment */
+    size_t size = (tc->getSystemPtr()->threads.size() + sizeof(cpumask->bits[0]) - 1) /
+                    sizeof(cpumask->bits[0]) * sizeof(cpumask->bits[0]);
+    cpumask = (RiscvLinux::cpumask_t *)malloc(sizeof(cpumask->size) + size);
+    if (cpumask != nullptr) {
+        cpumask->size = size;
+        cpumask_clear(cpumask);
+    }
+
+    return cpumask;
+}
+
+static inline void
+cpumask_free(RiscvLinux::cpumask_t *cpu_online_mask)
+{
+    free(cpu_online_mask);
+}
+
+static inline bool
+riscv_hwprobe_key_is_valid(int64_t key)
+{
+    return key >= 0 && key <= RISCV_HWPROBE_MAX_KEY;
+}
+
+static inline bool
+hwprobe_key_is_bitmask(int64_t key)
+{
+    switch (key) {
+    case RiscvLinux::BaseBehavior:
+    case RiscvLinux::IMAExt0:
+    case RiscvLinux::Cpuperf0:
+        return true;
+    }
+
+    return false;
+}
+
+static inline bool
+riscv_hwprobe_pair_cmp(RiscvLinux::riscv_hwprobe *pair,
+                       RiscvLinux::riscv_hwprobe *other_pair)
+{
+    if (pair->key != other_pair->key) {
+        return false;
+    }
+
+    if (hwprobe_key_is_bitmask(pair->key)) {
+        return (pair->value & other_pair->value) == other_pair->value;
+    }
+
+    return pair->value == other_pair->value;
+}
+
+static inline RiscvLinux::cpumask_t *
+get_cpu_online_mask(ThreadContext *tc)
+{
+    RiscvLinux::cpumask_t *cpu_online_mask = cpumask_malloc(tc);
+    if (cpu_online_mask != nullptr) {
+        for (int i = 0; i < tc->getSystemPtr()->threads.size(); i++) {
+            CPU_SET(i, (cpu_set_t *)&cpu_online_mask->bits);
+        }
+    }
+
+    return cpu_online_mask;
+}
+
+static void
+hwprobe_one_pair(ThreadContext *tc, RiscvLinux::riscv_hwprobe *pair,
+                 RiscvLinux::cpumask_t *cpus)
+{
+    switch (pair->key) {
+    case RiscvLinux::Mvendorid:
+        pair->value = tc->readMiscRegNoEffect(CSRData.at(CSR_MVENDORID).physIndex);
+        break;
+    case RiscvLinux::Marchid:
+        pair->value = tc->readMiscRegNoEffect(CSRData.at(CSR_MARCHID).physIndex);
+        break;
+    case RiscvLinux::Mimpid:
+        pair->value = tc->readMiscRegNoEffect(CSRData.at(CSR_MIMPID).physIndex);
+        break;
+    case RiscvLinux::BaseBehavior:
+        {
+            MISA misa = tc->readMiscRegNoEffect(MISCREG_ISA);
+            RiscvLinux::key_base_behavior_t *base_behavior =
+                (RiscvLinux::key_base_behavior_t *)&pair->value;
+            if (misa.rvi && misa.rvm && misa.rva) {
+                base_behavior->ima = 1;
+            }
+        }
+        break;
+    case RiscvLinux::IMAExt0:
+        {
+            MISA misa = tc->readMiscRegNoEffect(MISCREG_ISA);
+            RiscvLinux::key_ima_ext_0_t *ext = (RiscvLinux::key_ima_ext_0_t *)&pair->value;
+            if (misa.rvf && misa.rvd) ext->FD = 1;
+            if (misa.rvc) ext->C = 1;
+            if (misa.rvv) ext->V = 1;
+            ext->ZBA = 1;
+            ext->ZBB = 1;
+            ext->ZBS = 1;
+            ext->ZICBOZ = 1;
+            ext->ZBC = 1;
+            ext->ZBKB = 1;
+            ext->ZBKC = 1;
+            ext->ZBKX = 1;
+            ext->ZKND = 1;
+            ext->ZKNE = 1;
+            ext->ZKNH = 1;
+            ext->ZKSED = 1;
+            ext->ZKSH = 1;
+            ext->ZKT = 1;
+            ext->ZFH = 1;
+            ext->ZFHMIN = 1;
+            ext->ZVFH = 1;
+            ext->ZVFHMIN = 1;
+            ext->ZICOND = 1;
+            ext->ZVE64D = 1;
+            ext->ZCB = 1;
+            ext->ZCD = 1;
+            ext->ZCF = 1;
+        }
+        break;
+    case RiscvLinux::Cpuperf0:
+    case RiscvLinux::MisalignedScalarPerf:
+        pair->value = RiscvLinux::Slow;
+        break;
+    case RiscvLinux::ZicbozBlockSize:
+        pair->value = tc->getSystemPtr()->cacheLineSize();
+        break;
+    case RiscvLinux::HighestVirtAddress:
+        pair->value = tc->getProcessPtr()->memState->getMmapEnd();
+        break;
+
+    /*
+     * For forward compatibility, unknown keys don't fail the whole
+     * call, but get their element key set to -1 and value set to 0
+     * indicating they're unrecognized.
+     */
+    default:
+        pair->key = -1;
+        pair->value = 0;
+        break;
+    }
+}
+
+// riscv_hwprobe without flags: after checking that cpus_user selects at
+// least one online CPU (NULL/0 means all of them), fill in every pair.
+template <class OS>
+static int
+hwprobe_get_values(ThreadContext *tc, VPtr<> pairs, typename OS::size_t pair_count,
+                   typename OS::size_t cpusetsize, VPtr<> cpus_user, unsigned int flags)
+{
+    /* Check the reserved flags. */
+    if (flags != 0) {
+        return -EINVAL;
+    }
+
+    RiscvLinux::cpumask_t *cpu_online_mask = get_cpu_online_mask(tc);
+    if (cpu_online_mask == nullptr) {
+        return -ENOMEM;
+    }
+
+    RiscvLinux::cpumask_t *cpus = cpumask_malloc(tc);
+    if (cpus == nullptr) {
+        cpumask_free(cpu_online_mask);
+        return -ENOMEM;
+    }
+
+    if (cpusetsize > cpu_online_mask->size) {
+        cpusetsize = cpu_online_mask->size;
+    }
+
+    BufferArg pairs_buf(pairs, sizeof(RiscvLinux::riscv_hwprobe) * pair_count);
+
+    /*
+    * The interface supports taking in a CPU mask, and returns values that
+    * are consistent across that mask. Allow userspace to specify NULL and
+    * 0 as a shortcut to all online CPUs.
+    */
+    if (cpusetsize == 0 && !cpus_user) {
+        cpumask_copy(cpus, cpu_online_mask);
+    } else {
+        BufferArg cpus_user_buf(cpus_user, cpusetsize);
+        cpus_user_buf.copyIn(SETranslatingPortProxy(tc));
+
+        // Keep both masks at their allocated size: cpumask_malloc() zeroed
+        // cpus, so the bytes past cpusetsize simply stay clear.
+        memcpy(cpus->bits, cpus_user_buf.bufferPtr(), cpusetsize);
+
+        /*
+        * Userspace must provide at least one online CPU, without that
+        * there's no way to define what is supported.
+        */
+        cpumask_and(cpus, cpus, cpu_online_mask);
+        if (cpumask_empty(cpus)) {
+            cpumask_free(cpu_online_mask);
+            cpumask_free(cpus);
+            return -EINVAL;
+        }
+    }
+
+    pairs_buf.copyIn(SETranslatingPortProxy(tc));
+    auto *pair = (RiscvLinux::riscv_hwprobe *)pairs_buf.bufferPtr();
+
+    for (size_t i = 0; i < pair_count; i++, pair++) {
+        pair->value = 0;
+        hwprobe_one_pair(tc, pair, cpus);
+    }
+
+    pairs_buf.copyOut(SETranslatingPortProxy(tc));
+
+    cpumask_free(cpu_online_mask);
+    cpumask_free(cpus);
+
+    return 0;
+}
+
+// riscv_hwprobe with RISCV_HWPROBE_WHICH_CPUS: narrow the set in cpus_user to
+// the online CPUs whose probed values match every pair, and write it back.
+template <class OS>
+static int
+hwprobe_get_cpus(ThreadContext *tc, VPtr<> pairs, typename OS::size_t pair_count,
+                 typename OS::size_t cpusetsize, VPtr<> cpus_user, unsigned int flags)
+{
+    if (flags != RISCV_HWPROBE_WHICH_CPUS) {
+        return -EINVAL;
+    }
+
+    if (cpusetsize == 0 || !cpus_user) {
+        return -EINVAL;
+    }
+
+    RiscvLinux::cpumask_t *cpu_online_mask = get_cpu_online_mask(tc);
+    if (cpu_online_mask == nullptr) {
+        return -ENOMEM;
+    }
+
+    RiscvLinux::cpumask_t *cpus = cpumask_malloc(tc);
+    if (cpus == nullptr) {
+        cpumask_free(cpu_online_mask);
+        return -ENOMEM;
+    }
+
+    RiscvLinux::cpumask_t *one_cpu = cpumask_malloc(tc);
+    if (one_cpu == nullptr) {
+        cpumask_free(cpu_online_mask);
+        cpumask_free(cpus);
+        return -ENOMEM;
+    }
+
+    if (cpusetsize > cpu_online_mask->size) {
+        cpusetsize = cpu_online_mask->size;
+    }
+
+    BufferArg cpus_user_buf(cpus_user, cpusetsize);
+    cpus_user_buf.copyIn(SETranslatingPortProxy(tc));
+    memcpy(cpus->bits, cpus_user_buf.bufferPtr(), cpusetsize);
+
+    // An empty set selects all online CPUs; cpusetsize keeps the buffer size.
+    if (cpumask_empty(cpus)) {
+        cpumask_copy(cpus, cpu_online_mask);
+    }
+
+    cpumask_and(cpus, cpus, cpu_online_mask);
+
+    BufferArg pairs_buf(pairs, sizeof(RiscvLinux::riscv_hwprobe) * pair_count);
+    pairs_buf.copyIn(SETranslatingPortProxy(tc));
+    auto *pair = (RiscvLinux::riscv_hwprobe *)pairs_buf.bufferPtr();
+
+    for (size_t i = 0; i < pair_count; i++, pair++) {
+        if (!riscv_hwprobe_key_is_valid(pair->key)) {
+            *pair = (RiscvLinux::riscv_hwprobe){ .key = -1, .value = 0 };
+            cpumask_clear(cpus);
+            break;
+        }
+
+        RiscvLinux::riscv_hwprobe tmp{ pair->key, 0 };
+
+        for (unsigned int cpu = 0; cpu < cpusetsize * 8; cpu++) {
+            if (!cpumask_test_cpu(cpu, cpus)) {
+                continue;
+            }
+
+            // Probe this CPU alone and drop it from the result on mismatch.
+            cpumask_set_cpu(cpu, one_cpu);
+            hwprobe_one_pair(tc, &tmp, one_cpu);
+            if (!riscv_hwprobe_pair_cmp(&tmp, pair)) {
+                cpumask_clear_cpu(cpu, cpus);
+            }
+            cpumask_clear_cpu(cpu, one_cpu);
+        }
+    }
+
+    // Hand the narrowed CPU set back to the guest before copying out.
+    memcpy(cpus_user_buf.bufferPtr(), cpus->bits, cpusetsize);
+    pairs_buf.copyOut(SETranslatingPortProxy(tc));
+    cpus_user_buf.copyOut(SETranslatingPortProxy(tc));
+
+    cpumask_free(cpu_online_mask);
+    cpumask_free(cpus);
+    cpumask_free(one_cpu);
+
+    return 0;
+}
+
+template <class OS>
+static SyscallReturn
+riscvHWProbeFunc(SyscallDesc *desc, ThreadContext *tc, VPtr<> pairs,
+                 typename OS::size_t pair_count, typename OS::size_t cpusetsize,
+                 VPtr<> cpus_user, unsigned int flags)
+{
+    if (flags & RISCV_HWPROBE_WHICH_CPUS) {
+        return hwprobe_get_cpus<OS>(tc, pairs, pair_count, cpusetsize,
+                                    cpus_user, flags);
+    }
+
+    return hwprobe_get_values<OS>(tc, pairs, pair_count, cpusetsize,
+                                  cpus_user, flags);
+}
+
 SyscallDescTable<SEWorkload::SyscallABI64> EmuLinux::syscallDescs64 = {
     { 0,    "io_setup" },
     { 1,    "io_destroy" },
@@ -382,6 +766,7 @@ SyscallDescTable<SEWorkload::SyscallABI64> EmuLinux::syscallDescs64 = {
     { 241,  "perf_event_open" },
     { 242,  "accept4" },
     { 243,  "recvmmsg" },
+    { 258,  "riscv_hwprobe", riscvHWProbeFunc<RiscvLinux64> },
     { 260,  "wait4", wait4Func<RiscvLinux64> },
     { 261,  "prlimit64", prlimitFunc<RiscvLinux64> },
     { 262,  "fanotify_init" },
@@ -748,6 +1133,7 @@ SyscallDescTable<SEWorkload::SyscallABI32> EmuLinux::syscallDescs32 = {
     { 241,  "perf_event_open" },
     { 242,  "accept4" },
     { 243,  "recvmmsg" },
+    { 258,  "riscv_hwprobe", riscvHWProbeFunc<RiscvLinux32> },
     { 260,  "wait4", wait4Func<RiscvLinux32> },
     { 261,  "prlimit64", prlimitFunc<RiscvLinux32> },
     { 262,  "fanotify_init" },

From deb8f983a1cb8bd94885efa290f1f293eea63711 Mon Sep 17 00:00:00 2001
From: Matthew Poremba <matthew.poremba@amd.com>
Date: Mon, 14 Oct 2024 10:19:52 -0700
Subject: [PATCH 35/47] arch-vega: Fix multi-dword setElem in PackedReg (#1664)

There are two issues related to setting an element in PackedReg where
the element spans multiple dwords. First, the mask value is wrong and is
clobbering both dwords. Second, a portion of the value is shifted out of
the narrower input type.

Fix this by using the correct mask to clear the bits where the value
will be placed and use a larger data type to shift the value into place.
---
 src/arch/amdgpu/vega/operand.hh | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/arch/amdgpu/vega/operand.hh b/src/arch/amdgpu/vega/operand.hh
index 1bb9b43d1f..8e76405562 100644
--- a/src/arch/amdgpu/vega/operand.hh
+++ b/src/arch/amdgpu/vega/operand.hh
@@ -960,11 +960,14 @@ class PackedReg
         uint64_t elem_mask = (1ULL << ELEM_SIZE) - 1;
         value &= elem_mask;
 
+        // Clear the bits where the value goes so that operator| can be used.
         elem_mask <<= qw_lbit;
-        qword &= elem_mask;
+        qword &= ~elem_mask;
 
-        value <<= qw_lbit;
-        qword |= value;
+        // Promote to 64-bit to prevent shifting out of range
+        uint64_t value64 = value;
+        value64 <<= qw_lbit;
+        qword |= value64;
 
         dwords[udw] = uint32_t(qword >> 32);
         dwords[ldw] = uint32_t(qword & mask(32));

From aa782cffee6f48d56d1a65710380a03502390f91 Mon Sep 17 00:00:00 2001
From: Roger Chang <rogerycchang@google.com>
Date: Thu, 4 Jul 2024 16:02:49 +0800
Subject: [PATCH 36/47] arch-riscv: Add enable_Zcd options to RiscvISA

The Zcd instruction encodings overlap with those of Zcmp and Zcmt.

This option enables or disables the Zcd extension, which in turn
determines whether Zcmp/Zcmt are enabled: if Zcd is enabled, Zcmp and
Zcmt are disabled; otherwise, Zcmp and Zcmt are enabled.

Spec: https://github.com/riscv/riscv-isa-manual/blob/main/src/zc.adoc#zc-overview

Change-Id: I3788eb6539e13a210c9946efc43ca1fef4639560
---
 src/arch/riscv/RiscvISA.py       |   7 ++
 src/arch/riscv/decoder.cc        |   2 +
 src/arch/riscv/decoder.hh        |   1 +
 src/arch/riscv/isa.cc            |   2 +-
 src/arch/riscv/isa.hh            |  10 +++
 src/arch/riscv/isa/bitfields.isa |   1 +
 src/arch/riscv/isa/decoder.isa   | 114 +++++++++++++++++--------------
 src/arch/riscv/types.hh          |   1 +
 8 files changed, 84 insertions(+), 54 deletions(-)

diff --git a/src/arch/riscv/RiscvISA.py b/src/arch/riscv/RiscvISA.py
index f87941d413..05854f48c5 100644
--- a/src/arch/riscv/RiscvISA.py
+++ b/src/arch/riscv/RiscvISA.py
@@ -114,6 +114,13 @@ class RiscvISA(BaseISA):
 
     enable_Zicbom_fs = Param.Bool(True, "Enable Zicbom extension in FS mode")
     enable_Zicboz_fs = Param.Bool(True, "Enable Zicboz extension in FS mode")
+    enable_Zcd = Param.Bool(
+        True,
+        "Enable Zcd extensions. "
+        "Set the option to false implies the Zcmp and Zcmt is enable as "
+        "c.fsdsp is overlap with them."
+        "Refs: https://github.com/riscv/riscv-isa-manual/blob/main/src/zc.adoc",
+    )
 
     wfi_resume_on_pending = Param.Bool(
         False,
diff --git a/src/arch/riscv/decoder.cc b/src/arch/riscv/decoder.cc
index ee5d313587..557be1cbef 100644
--- a/src/arch/riscv/decoder.cc
+++ b/src/arch/riscv/decoder.cc
@@ -44,6 +44,7 @@ Decoder::Decoder(const RiscvDecoderParams &p) : InstDecoder(p, &machInst)
     ISA *isa = dynamic_cast<ISA*>(p.isa);
     vlen = isa->getVecLenInBits();
     elen = isa->getVecElemLenInBits();
+    _enableZcd = isa->enableZcd();
     reset();
 }
 
@@ -127,6 +128,7 @@ Decoder::decode(PCStateBase &_next_pc)
     emi.vtype8  = next_pc.vtype() & 0xff;
     emi.vill    = next_pc.vtype().vill;
     emi.rv_type = static_cast<int>(next_pc.rvType());
+    emi.enable_zcd = _enableZcd;
 
     return decode(emi, next_pc.instAddr());
 }
diff --git a/src/arch/riscv/decoder.hh b/src/arch/riscv/decoder.hh
index bf863fda22..d44455cd0b 100644
--- a/src/arch/riscv/decoder.hh
+++ b/src/arch/riscv/decoder.hh
@@ -62,6 +62,7 @@ class Decoder : public InstDecoder
 
     uint32_t vlen;
     uint32_t elen;
+    bool _enableZcd;
 
     virtual StaticInstPtr decodeInst(ExtMachInst mach_inst);
 
diff --git a/src/arch/riscv/isa.cc b/src/arch/riscv/isa.cc
index bcc22d7cb0..7f4d97f4e9 100644
--- a/src/arch/riscv/isa.cc
+++ b/src/arch/riscv/isa.cc
@@ -260,7 +260,7 @@ RegClass ccRegClass(CCRegClass, CCRegClassName, 0, debug::IntRegs);
 ISA::ISA(const Params &p) : BaseISA(p, "riscv"),
     _rvType(p.riscv_type), enableRvv(p.enable_rvv), vlen(p.vlen), elen(p.elen),
     _privilegeModeSet(p.privilege_mode_set),
-    _wfiResumeOnPending(p.wfi_resume_on_pending)
+    _wfiResumeOnPending(p.wfi_resume_on_pending), _enableZcd(p.enable_Zcd)
 {
     _regClasses.push_back(&intRegClass);
     _regClasses.push_back(&floatRegClass);
diff --git a/src/arch/riscv/isa.hh b/src/arch/riscv/isa.hh
index 29a75854c7..cda2df41e6 100644
--- a/src/arch/riscv/isa.hh
+++ b/src/arch/riscv/isa.hh
@@ -108,6 +108,14 @@ class ISA : public BaseISA
     */
     const bool _wfiResumeOnPending;
 
+    /**
+     * Whether the Zcd extension is enabled.
+     * Setting this to false implies that Zcmp and Zcmt are enabled instead,
+     * because their encodings overlap with Zcd's (e.g. c.fsdsp).
+     * Refs: https://github.com/riscv/riscv-isa-manual/blob/main/src/zc.adoc
+     */
+    bool _enableZcd;
+
   public:
     using Params = RiscvISAParams;
 
@@ -184,6 +192,8 @@ class ISA : public BaseISA
 
     bool resumeOnPending() { return _wfiResumeOnPending; }
 
+    bool enableZcd() { return _enableZcd; }
+
     virtual Addr getFaultHandlerAddr(
         RegIndex idx, uint64_t cause, bool intr) const;
 };
diff --git a/src/arch/riscv/isa/bitfields.isa b/src/arch/riscv/isa/bitfields.isa
index 0b4fae7b82..4589184e68 100644
--- a/src/arch/riscv/isa/bitfields.isa
+++ b/src/arch/riscv/isa/bitfields.isa
@@ -34,6 +34,7 @@
 // Bitfield definitions.
 //
 def bitfield RVTYPE rv_type;
+def bitfield ENABLE_ZCD enable_zcd;
 
 def bitfield QUADRANT <1:0>;
 def bitfield OPCODE5 <6:2>;
diff --git a/src/arch/riscv/isa/decoder.isa b/src/arch/riscv/isa/decoder.isa
index c1dc790f26..db0e60cc77 100644
--- a/src/arch/riscv/isa/decoder.isa
+++ b/src/arch/riscv/isa/decoder.isa
@@ -54,23 +54,25 @@ decode QUADRANT default Unknown::unknown() {
             Rp2 = rvSext(sp + imm);
         }}, uint64_t);
         format CompressedLoad {
-            0x1: c_fld({{
-                offset = CIMM3 << 3 | CIMM2 << 6;
-            }}, {{
-                STATUS status = xc->readMiscReg(MISCREG_STATUS);
-                if (status.fs == FPUStatus::OFF)
-                    return std::make_shared<IllegalInstFault>("FPU is off",
-                                                               machInst);
+            0x1: decode ENABLE_ZCD {
+                0x1: c_fld({{
+                    offset = CIMM3 << 3 | CIMM2 << 6;
+                }}, {{
+                    STATUS status = xc->readMiscReg(MISCREG_STATUS);
+                    if (status.fs == FPUStatus::OFF)
+                        return std::make_shared<IllegalInstFault>("FPU is off",
+                                                                   machInst);
 
-                // Mutating any floating point register changes the FS bit
-                // of the STATUS CSR.
-                status.fs = FPUStatus::DIRTY;
-                xc->setMiscReg(MISCREG_STATUS, status);
+                    // Mutating any floating point register changes the FS bit
+                    // of the STATUS CSR.
+                    status.fs = FPUStatus::DIRTY;
+                    xc->setMiscReg(MISCREG_STATUS, status);
 
-                Fp2_bits = Mem;
-            }}, {{
-                EA = rvSext(Rp1 + offset);
-            }});
+                    Fp2_bits = Mem;
+                }}, {{
+                    EA = rvSext(Rp1 + offset);
+                }});
+            }
             0x2: c_lw({{
                 offset = CIMM2<1:1> << 2 |
                          CIMM3 << 3 |
@@ -152,18 +154,20 @@ decode QUADRANT default Unknown::unknown() {
             }
         }
         format CompressedStore {
-            0x5: c_fsd({{
-                offset = CIMM3 << 3 | CIMM2 << 6;
-            }}, {{
-                STATUS status = xc->readMiscReg(MISCREG_STATUS);
-                if (status.fs == FPUStatus::OFF)
-                    return std::make_shared<IllegalInstFault>("FPU is off",
-                                                               machInst);
+            0x5: decode ENABLE_ZCD {
+                0x1: c_fsd({{
+                    offset = CIMM3 << 3 | CIMM2 << 6;
+                }}, {{
+                    STATUS status = xc->readMiscReg(MISCREG_STATUS);
+                    if (status.fs == FPUStatus::OFF)
+                        return std::make_shared<IllegalInstFault>("FPU is off",
+                                                                   machInst);
 
-                Mem = Fp2_bits;
-            }}, {{
-                EA = rvSext(Rp1 + offset);
-            }});
+                    Mem = Fp2_bits;
+                }}, {{
+                    EA = rvSext(Rp1 + offset);
+                }});
+            }
             0x6: c_sw({{
                 offset = CIMM2<1:1> << 2 |
                          CIMM3 << 3 |
@@ -381,23 +385,25 @@ decode QUADRANT default Unknown::unknown() {
             Rc1 = rvSext(Rc1 << imm);
         }}, uint64_t);
         format CompressedLoad {
-            0x1: c_fldsp({{
-                offset = CIMM5<4:3> << 3 |
-                         CIMM1 << 5 |
-                         CIMM5<2:0> << 6;
-            }}, {{
-                STATUS status = xc->readMiscReg(MISCREG_STATUS);
-                if (status.fs == FPUStatus::OFF)
-                    return std::make_shared<IllegalInstFault>("FPU is off",
-                                                               machInst);
+            0x1: decode ENABLE_ZCD {
+                0x1: c_fldsp({{
+                    offset = CIMM5<4:3> << 3 |
+                             CIMM1 << 5 |
+                             CIMM5<2:0> << 6;
+                }}, {{
+                    STATUS status = xc->readMiscReg(MISCREG_STATUS);
+                    if (status.fs == FPUStatus::OFF)
+                        return std::make_shared<IllegalInstFault>("FPU is off",
+                                                                   machInst);
 
-                status.fs = FPUStatus::DIRTY;
-                xc->setMiscReg(MISCREG_STATUS, status);
+                    status.fs = FPUStatus::DIRTY;
+                    xc->setMiscReg(MISCREG_STATUS, status);
 
-                Fc1_bits = Mem;
-            }}, {{
-                EA = rvSext(sp + offset);
-            }});
+                    Fc1_bits = Mem;
+                }}, {{
+                    EA = rvSext(sp + offset);
+                }});
+            }
             0x2: c_lwsp({{
                 offset = CIMM5<4:2> << 2 |
                          CIMM1 << 5 |
@@ -480,19 +486,21 @@ decode QUADRANT default Unknown::unknown() {
             }
         }
         format CompressedStore {
-            0x5: c_fsdsp({{
-                offset = CIMM6<5:3> << 3 |
-                         CIMM6<2:0> << 6;
-            }}, {{
-                STATUS status = xc->readMiscReg(MISCREG_STATUS);
-                if (status.fs == FPUStatus::OFF)
-                    return std::make_shared<IllegalInstFault>("FPU is off",
-                                                               machInst);
+            0x5: decode ENABLE_ZCD {
+                0x1: c_fsdsp({{
+                    offset = CIMM6<5:3> << 3 |
+                             CIMM6<2:0> << 6;
+                }}, {{
+                    STATUS status = xc->readMiscReg(MISCREG_STATUS);
+                    if (status.fs == FPUStatus::OFF)
+                        return std::make_shared<IllegalInstFault>("FPU is off",
+                                                                   machInst);
 
-                Mem_ud = Fc2_bits;
-            }}, {{
-                EA = rvSext(sp + offset);
-            }});
+                    Mem_ud = Fc2_bits;
+                }}, {{
+                    EA = rvSext(sp + offset);
+                }});
+            }
             0x6: c_swsp({{
                 offset = CIMM6<5:2> << 2 |
                          CIMM6<1:0> << 6;
diff --git a/src/arch/riscv/types.hh b/src/arch/riscv/types.hh
index c7edffc2f7..4bd3168804 100644
--- a/src/arch/riscv/types.hh
+++ b/src/arch/riscv/types.hh
@@ -58,6 +58,7 @@ BitUnion64(ExtMachInst)
     // Decoder state
     Bitfield<63, 62>    rv_type;
     Bitfield<61>        compressed;
+    Bitfield<60>        enable_zcd;
     // More bits for vector extension
     Bitfield<57, 41>    vl;     // [0, 2**16]
     Bitfield<40>        vill;

From 28b112e2a6939caec206dd4d262faaaeb833f4f0 Mon Sep 17 00:00:00 2001
From: Roger Chang <rogerycchang@google.com>
Date: Thu, 4 Jul 2024 16:02:49 +0800
Subject: [PATCH 37/47] arch-riscv: Implement Zcmp

Implement instructions:
cm.push
cm.pop
cm.popret
cm.popretz
cm.mva01s
cm.mvsa01

Spec: https://github.com/riscv/riscv-isa-manual/blob/main/src/zc.adoc#zcmp

Change-Id: I2921c4bdb0c654858a237386056ebb2aed643a5a
---
 src/arch/riscv/insts/SConscript        |   1 +
 src/arch/riscv/insts/zcmp.cc           | 130 +++++
 src/arch/riscv/insts/zcmp.hh           |  60 ++
 src/arch/riscv/isa/bitfields.isa       |   3 +
 src/arch/riscv/isa/decoder.isa         |  14 +
 src/arch/riscv/isa/formats/formats.isa |   1 +
 src/arch/riscv/isa/formats/zcmp.isa    | 772 +++++++++++++++++++++++++
 src/arch/riscv/isa/includes.isa        |   1 +
 src/arch/riscv/isa/operands.isa        |   4 +
 src/arch/riscv/regs/int.hh             |  12 +
 src/arch/riscv/types.hh                |   4 +
 11 files changed, 1002 insertions(+)
 create mode 100644 src/arch/riscv/insts/zcmp.cc
 create mode 100644 src/arch/riscv/insts/zcmp.hh
 create mode 100644 src/arch/riscv/isa/formats/zcmp.isa

diff --git a/src/arch/riscv/insts/SConscript b/src/arch/riscv/insts/SConscript
index 2822cf86b4..2519b3e07d 100644
--- a/src/arch/riscv/insts/SConscript
+++ b/src/arch/riscv/insts/SConscript
@@ -34,3 +34,4 @@ Source('mem.cc', tags='riscv isa')
 Source('standard.cc', tags='riscv isa')
 Source('static_inst.cc', tags='riscv isa')
 Source('vector.cc', tags='riscv isa')
+Source('zcmp.cc', tags='riscv isa')
diff --git a/src/arch/riscv/insts/zcmp.cc b/src/arch/riscv/insts/zcmp.cc
new file mode 100644
index 0000000000..018ea45a60
--- /dev/null
+++ b/src/arch/riscv/insts/zcmp.cc
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2024 Google LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "arch/riscv/insts/zcmp.hh"
+
+#include <string>
+
+#include "arch/riscv/regs/int.hh"
+#include "arch/riscv/utility.hh"
+
+namespace gem5
+{
+
+namespace RiscvISA
+{
+// Keep a copy of the encoding's rlist field; the base class gets the rest.
+CmMacroInst::CmMacroInst(
+    const char* mnem, ExtMachInst machInst, OpClass opClass)
+    : RiscvMacroInst(mnem, machInst, opClass), rlist(machInst.rlist)
+{
+}
+// Stack adjustment in bytes: a base amount picked by rlist plus spimm * 16.
+// Ref: https://github.com/riscv-software-src/riscv-isa-sim/blob/f7d0dba60/
+//      riscv/decode.h#L168
+uint64_t
+CmMacroInst::stackAdj() const
+{
+    uint64_t stack_adj_base = 0;  // stays 0 if rlist matches no case
+    switch (machInst.rlist) {  // cases deliberately fall through and add up
+      case 15:
+        stack_adj_base += 16;
+        [[fallthrough]];
+      case 14:
+        if (machInst.rv_type == RV64) {  // this step counts on RV64 only
+            stack_adj_base += 16;
+        }
+        [[fallthrough]];
+      case 13:
+      case 12:
+        stack_adj_base += 16;
+        [[fallthrough]];
+      case 11:
+      case 10:
+        if (machInst.rv_type == RV64) {  // this step counts on RV64 only
+            stack_adj_base += 16;
+        }
+        [[fallthrough]];
+      case 9:
+      case 8:
+        stack_adj_base += 16;
+        [[fallthrough]];
+      case 7:
+      case 6:
+        if (machInst.rv_type == RV64) {  // this step counts on RV64 only
+            stack_adj_base += 16;
+        }
+        [[fallthrough]];
+      case 5:
+      case 4:
+        stack_adj_base += 16;
+        break;
+    }
+
+    return stack_adj_base + machInst.spimm * 16;  // spimm adds 16-byte units
+}
+// Text form of the register list selected by rlist, used in disassembly.
+std::string
+CmMacroInst::getRlistStr() const
+{
+    std::string s = "";
+    switch (machInst.rlist) {
+      case 15:  // range ends at PushPopRegList[0]
+        s = csprintf("{%s, %s-%s}", registerName(ReturnAddrReg),
+                     registerName(int_reg::S0),
+                     registerName(PushPopRegList[0]));
+        break;
+      case 14:
+      case 13:
+      case 12:
+      case 11:
+      case 10:
+      case 9:
+      case 8:
+      case 7:
+      case 6:  // range ends at PushPopRegList[16 - rlist]
+        s = csprintf("{%s, %s-%s}", registerName(ReturnAddrReg),
+                     registerName(int_reg::S0),
+                     registerName(PushPopRegList[16-machInst.rlist]));
+        break;
+      case 5:  // return address register and S0 only
+        s = csprintf("{%s, %s}", registerName(ReturnAddrReg),
+                     registerName(int_reg::S0));
+        break;
+      case 4:  // return address register only
+        s = csprintf("{%s}", registerName(ReturnAddrReg));
+        break;
+      default:  // any other rlist: s stays empty
+        break;
+    }
+
+    return s;
+}
+
+} // namespace RiscvISA
+} // namespace gem5
diff --git a/src/arch/riscv/insts/zcmp.hh b/src/arch/riscv/insts/zcmp.hh
new file mode 100644
index 0000000000..5f0d734b10
--- /dev/null
+++ b/src/arch/riscv/insts/zcmp.hh
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2024 Google LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __ARCH_RISCV_INSTS_ZCMP_HH__
+#define __ARCH_RISCV_INSTS_ZCMP_HH__
+
+#include <string>
+
+#include "arch/riscv/insts/static_inst.hh"
+#include "cpu/static_inst.hh"
+
+namespace gem5
+{
+
+namespace RiscvISA
+{
+// Base class for the CmPush/CmPop macro-ops: rlist plus shared helpers.
+class CmMacroInst : public RiscvMacroInst
+{
+  public:
+    CmMacroInst(const char* mnem, ExtMachInst machInst, OpClass opClass);
+
+  protected:
+    using RiscvMacroInst::RiscvMacroInst;
+
+    uint64_t stackAdj() const;  // rlist-based amount + spimm * 16
+    std::string getRlistStr() const;  // register list text for disassembly
+
+    uint64_t rlist;  // copy of machInst.rlist, set by the constructor
+};
+
+} // namespace RiscvISA
+} // namespace gem5
+
+#endif // __ARCH_RISCV_INSTS_ZCMP_HH__
diff --git a/src/arch/riscv/isa/bitfields.isa b/src/arch/riscv/isa/bitfields.isa
index 4589184e68..5fc624acc1 100644
--- a/src/arch/riscv/isa/bitfields.isa
+++ b/src/arch/riscv/isa/bitfields.isa
@@ -104,10 +104,13 @@ def bitfield CFUNCT1 <12>;
 def bitfield CFUNCT1BIT6 <6>;
 def bitfield CFUNCT2HIGH <11:10>;
 def bitfield CFUNCT2LOW <6:5>;
+def bitfield CFUNCT2MID <9:8>;  // picks the cm.push / cm.pop variant
 def bitfield RC1 <11:7>;
 def bitfield RC2 <6:2>;
 def bitfield RP1 <9:7>;
 def bitfield RP2 <4:2>;
+def bitfield R1S <9:7>;  // same bit range as RP1
+def bitfield R2S <4:2>;  // same bit range as RP2
 def bitfield FC1 <11:7>;
 def bitfield FC2 <6:2>;
 def bitfield FP2 <4:2>;
diff --git a/src/arch/riscv/isa/decoder.isa b/src/arch/riscv/isa/decoder.isa
index db0e60cc77..90efb8ad82 100644
--- a/src/arch/riscv/isa/decoder.isa
+++ b/src/arch/riscv/isa/decoder.isa
@@ -487,6 +487,20 @@ decode QUADRANT default Unknown::unknown() {
         }
         format CompressedStore {
             0x5: decode ENABLE_ZCD {
+                0x0: decode CFUNCT6LOW3 {  // ENABLE_ZCD == 0: Zcmp encodings
+                    0x3: decode CFUNCT2LOW {  // bits <6:5>
+                        0x1: CmMvsa01::cm_mvsa01();
+                        0x3: CmMva01s::cm_mva01s();
+                    }
+                    0x6: decode CFUNCT2MID {  // bits <9:8>
+                        0x0: CmPush::cm_push();
+                        0x2: CmPop::cm_pop();
+                    }
+                    0x7: decode CFUNCT2MID {  // bits <9:8>
+                        0x0: CmPop::cm_popretz(is_ret=True, has_a0=True);
+                        0x2: CmPop::cm_popret(is_ret=True);
+                    }
+                }
                 0x1: c_fsdsp({{
                     offset = CIMM6<5:3> << 3 |
                              CIMM6<2:0> << 6;
diff --git a/src/arch/riscv/isa/formats/formats.isa b/src/arch/riscv/isa/formats/formats.isa
index 0102df17d7..377bc5d061 100644
--- a/src/arch/riscv/isa/formats/formats.isa
+++ b/src/arch/riscv/isa/formats/formats.isa
@@ -40,6 +40,7 @@
 ##include "vector_conf.isa"
 ##include "vector_arith.isa"
 ##include "vector_mem.isa"
+##include "zcmp.isa"
 
 // Include formats for nonstandard extensions
 ##include "compressed.isa"
diff --git a/src/arch/riscv/isa/formats/zcmp.isa b/src/arch/riscv/isa/formats/zcmp.isa
new file mode 100644
index 0000000000..d8adbc5532
--- /dev/null
+++ b/src/arch/riscv/isa/formats/zcmp.isa
@@ -0,0 +1,772 @@
+// -*- mode:c++ -*-
+
+// Copyright (c) 2015 RISC-V Foundation
+// Copyright (c) 2016 The University of Virginia
+// Copyright (c) 2024 Google LLC
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met: redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer;
+// redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution;
+// neither the name of the copyright holders nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// CmPush macro-op class declaration: constructor and disassembly override.
+def template CmPushDeclare {{
+    class %(class_name)s : public %(base_class)s
+    {
+      public:
+        %(class_name)s(ExtMachInst machInst);
+
+      protected:
+        using %(base_class)s::%(base_class)s;
+
+        std::string generateDisassembly(
+            Addr pc, const loader::SymbolTable *symtab) const override;
+    };
+}};
+
+// CmPush constructor: builds the store and sp-adjust microop sequence.
+def template CmPushConstructor {{
+    %(class_name)s::%(class_name)s(ExtMachInst machInst) :
+      %(base_class)s("%(mnemonic)s", machInst, %(op_class)s)
+    {
+        StaticInstPtr cur_inst = nullptr;
+        if (rlist < 4) {  // rlist below 4: only an Unknown microop is emitted
+            cur_inst = new Unknown(machInst);
+            cur_inst->setFlag(IsMicroop);
+            microops.emplace_back(cur_inst);
+        } else {
+            int start_reg = 0;  // first PushPopRegList index to store
+            if (rlist != 15) {
+                start_reg = (16-rlist);
+            }
+
+            int offset = 0;  // becomes more negative for each register
+            for (int i = start_reg; i < PushPopRegList.size(); i++) {
+                offset -= rvSelect(4, 8);
+
+                if (machInst.rv_type == RV32) {
+                    cur_inst = new %(class_name)s32MicroInst(
+                        machInst, PushPopRegList[i], offset);
+                } else {
+                    cur_inst = new %(class_name)s64MicroInst(
+                        machInst, PushPopRegList[i], offset);
+                }
+                microops.emplace_back(cur_inst);
+            }
+
+            cur_inst = new %(class_name)sSpAdjMicroInst(machInst, -stackAdj());
+            microops.emplace_back(cur_inst);
+        }
+
+        microops.front()->setFirstMicroop();
+        microops.back()->setLastMicroop();
+    }
+}};
+// CmPush disassembly text: mnemonic, register list, negated stack adjust.
+def template CmPushExecute {{
+    std::string
+    %(class_name)s::generateDisassembly(
+        Addr pc, const loader::SymbolTable *symtab) const
+    {
+        std::stringstream ss;
+        ss << mnemonic << ' ' << getRlistStr() << ", " << (int64_t)-stackAdj();
+        return ss.str();
+    }
+}};
+// Store microop declaration; the ctor takes a register id and an offset.
+def template CmStoreMicroDeclare {{
+    class %(class_name)s : public %(base_class)s
+    {
+      public:
+        %(class_name)s(ExtMachInst machInst, RegId push_reg, int64_t offset);
+
+        Fault execute(ExecContext *, trace::InstRecord *) const override;
+        Fault initiateAcc(ExecContext *, trace::InstRecord *) const override;
+        Fault completeAcc(
+            Packet *, ExecContext *, trace::InstRecord *) const override;
+        std::string generateDisassembly(
+            Addr, const loader::SymbolTable *) const override;
+
+      protected:
+        using %(base_class)s::%(base_class)s;
+
+      private:
+        %(reg_idx_arr_decl)s;
+
+        int64_t offset;  // offset passed at construction
+        Request::Flags memAccessFlags;  // handed to the memory write helpers
+    };
+}};
+// Store microop constructor: records offset, then runs generated setup.
+def template CmStoreMicroConstructor {{
+    %(class_name)s::%(class_name)s(
+        ExtMachInst machInst, RegId push_reg, int64_t offset)
+        : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s),
+        offset(offset)
+    {
+        %(set_reg_idx_arr)s;
+        %(constructor)s;  // NOTE(review): presumably consumes push_reg
+    }
+}};
+// Store microop execute() (via writeMemAtomicLE) and its disassembly.
+def template CmStoreMicroExecute {{
+    Fault
+    %(class_name)s::execute(
+        ExecContext *xc, trace::InstRecord *traceData) const
+    {
+        Addr EA;  // assigned by the ea_code snippet
+
+        %(op_decl)s;
+        %(op_rd)s;
+        %(ea_code)s;
+
+        %(memacc_code)s;
+
+        {
+            Fault fault =
+                writeMemAtomicLE(xc, traceData, Mem, EA, memAccessFlags,
+                        nullptr);
+            if (fault != NoFault)  // a failed write skips the write-back
+                return fault;
+        }
+
+        %(op_wb)s;
+
+        return NoFault;
+    }
+
+    std::string
+    %(class_name)s::generateDisassembly(
+        Addr pc, const loader::SymbolTable *symtab) const
+    {
+        std::stringstream ss;  // printed as: mnemonic src1, offset(src0)
+        ss << mnemonic << ' ' << registerName(srcRegIdx(1)) << ", " <<
+            offset << '(' << registerName(srcRegIdx(0)) << ')';
+        return ss.str();
+    }
+}};
+// Store microop initiateAcc(): issues the write through writeMemTimingLE.
+def template CmStoreMicroInitiateAcc {{
+    Fault
+    %(class_name)s::initiateAcc(ExecContext *xc,
+        trace::InstRecord *traceData) const
+    {
+        Addr EA;  // assigned by the ea_code snippet
+
+        %(op_decl)s;
+        %(op_rd)s;
+        %(ea_code)s;
+
+        %(memacc_code)s;
+
+        {
+            Fault fault = writeMemTimingLE(xc, traceData, Mem, EA,
+                memAccessFlags, nullptr);
+            if (fault != NoFault)  // a failed write skips the write-back
+                return fault;
+        }
+
+        %(op_wb)s;
+
+        return NoFault;
+    }
+}};
+// Store microop completeAcc(): nothing to do, always returns NoFault.
+def template CmStoreMicroCompleteAcc {{
+    Fault
+    %(class_name)s::completeAcc(PacketPtr pkt, ExecContext *xc,
+        trace::InstRecord *traceData) const
+    {
+        return NoFault;
+    }
+}};
+// Declaration of the microop that carries a stack-pointer adjustment (adj).
+def template SpAdjMicroDeclare {{
+    class %(class_name)s : public %(base_class)s
+    {
+      public:
+        %(class_name)s(ExtMachInst machInst, int64_t adj);
+      protected:
+        using %(base_class)s::%(base_class)s;
+
+        Fault execute(ExecContext *, trace::InstRecord *) const override;
+        std::string generateDisassembly(
+            Addr, const loader::SymbolTable *) const override;
+
+      private:
+        %(reg_idx_arr_decl)s;
+
+        int64_t adj;  // value passed at construction; printed in disassembly
+    };
+}};
+// SpAdj microop constructor: stores adj, then runs generated setup.
+def template SpAdjMicroConstructor {{
+    %(class_name)s::%(class_name)s(ExtMachInst machInst, int64_t adj)
+        : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s), adj(adj)
+    {
+        %(set_reg_idx_arr)s;
+        %(constructor)s;
+    }
+}};
+// SpAdj microop execute() and disassembly (mnemonic, dest, src, adj).
+def template SpAdjMicroExecute {{
+    Fault
+    %(class_name)s::execute(
+        ExecContext *xc, trace::InstRecord *traceData) const
+    {
+        %(op_decl)s;
+        %(op_rd)s;
+        %(code)s;
+        %(op_wb)s;
+        return NoFault;
+    }
+
+    std::string
+    %(class_name)s::generateDisassembly(
+        Addr pc, const loader::SymbolTable *symtab) const
+    {
+        std::stringstream ss;  // fields are separated by single spaces
+        ss << mnemonic << ' ' << registerName(destRegIdx(0)) << ' '
+            << registerName(srcRegIdx(0)) << ' ' << adj;
+        return ss.str();
+    }
+}};
+
+// CmPop macro-op class declaration: constructor and disassembly override.
+def template CmPopDeclare {{
+    class %(class_name)s : public %(base_class)s
+    {
+      public:
+        %(class_name)s(ExtMachInst machInst);
+
+        std::string generateDisassembly(
+            Addr pc, const loader::SymbolTable *symtab) const override;
+
+      protected:
+        using %(base_class)s::%(base_class)s;
+    };
+}};
+
+// CmPop constructor: load microops, sp adjust, then optional extra microops.
+def template CmPopConstructor {{
+    %(class_name)s::%(class_name)s(ExtMachInst machInst) :
+      %(base_class)s("%(mnemonic)s", machInst, %(op_class)s)
+    {
+        StaticInstPtr cur_inst = nullptr;
+        if (rlist < 4) {  // rlist below 4: only an Unknown microop is emitted
+            cur_inst = new Unknown(machInst);
+            cur_inst->setFlag(IsMicroop);
+            microops.emplace_back(cur_inst);
+        } else {
+            int start_reg = 0;  // first PushPopRegList index to load
+            if (rlist != 15) {
+                start_reg = (16-rlist);
+            }
+
+            int offset = stackAdj();  // counts down from the full adjustment
+            for (int i = start_reg; i < PushPopRegList.size(); i++) {
+                offset -= rvSelect(4, 8);
+
+                if (machInst.rv_type == RV32) {
+                    cur_inst = new %(class_name)s32MicroInst(
+                        machInst, PushPopRegList[i], offset);
+                } else {
+                    cur_inst = new %(class_name)s64MicroInst(
+                        machInst, PushPopRegList[i], offset);
+                }
+                microops.emplace_back(cur_inst);
+            }
+
+            cur_inst = new %(class_name)sSpAdjMicroInst(machInst, stackAdj());
+            microops.emplace_back(cur_inst);
+
+            %(move_a0_desc)s;
+            %(return_desc)s;
+        }
+
+        microops.front()->setFirstMicroop();
+        microops.back()->setLastMicroop();
+    }
+}};
+// CmPop disassembly text: mnemonic, register list, stack adjustment.
+def template CmPopExecute {{
+    std::string
+    %(class_name)s::generateDisassembly(
+        Addr pc, const loader::SymbolTable *symtab) const
+    {
+        std::stringstream ss;
+        ss << mnemonic << ' ' << getRlistStr() << ", " << stackAdj();
+        return ss.str();
+    }
+}};
+// Load microop declaration; the ctor takes a register id and an offset.
+def template CmLoadMicroDeclare {{
+    class %(class_name)s : public %(base_class)s
+    {
+      public:
+        %(class_name)s(ExtMachInst machInst, RegId pop_reg, int64_t offset);
+
+        Fault execute(ExecContext *, trace::InstRecord *) const override;
+        Fault initiateAcc(ExecContext *, trace::InstRecord *) const override;
+        Fault completeAcc(
+            Packet *, ExecContext *, trace::InstRecord *) const override;
+        std::string generateDisassembly(
+            Addr, const loader::SymbolTable *) const override;
+
+      protected:
+        using %(base_class)s::%(base_class)s;
+
+      private:
+        %(reg_idx_arr_decl)s;
+
+        int64_t offset;  // offset passed at construction
+        Request::Flags memAccessFlags;  // handed to the memory read helpers
+    };
+}};
+// Load microop constructor: records offset, then runs generated setup.
+def template CmLoadMicroConstructor {{
+    %(class_name)s::%(class_name)s(
+        ExtMachInst machInst, RegId pop_reg, int64_t offset)
+        : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s),
+        offset(offset)
+    {
+        %(set_reg_idx_arr)s;
+        %(constructor)s;  // NOTE(review): presumably consumes pop_reg
+    }
+}};
+// Load microop execute() (via readMemAtomicLE) and its disassembly.
+def template CmLoadMicroExecute {{
+    Fault
+    %(class_name)s::execute(
+        ExecContext *xc, trace::InstRecord *traceData) const
+    {
+        Addr EA;  // assigned by the ea_code snippet
+
+        %(op_decl)s;
+        %(op_rd)s;
+        %(ea_code)s;
+
+        {
+            Fault fault =
+                readMemAtomicLE(xc, traceData, EA, Mem, memAccessFlags);
+            if (fault != NoFault)  // a failed read skips the write-back
+                return fault;
+        }
+
+        %(memacc_code)s;
+
+        %(op_wb)s;
+
+        return NoFault;
+    }
+
+    std::string
+    %(class_name)s::generateDisassembly(
+        Addr pc, const loader::SymbolTable *symtab) const
+    {
+        std::stringstream ss;  // printed as: mnemonic dest0, offset(src0)
+        ss << mnemonic << ' ' << registerName(destRegIdx(0)) << ", " <<
+            offset << '(' << registerName(srcRegIdx(0)) << ')';
+        return ss.str();
+    }
+}};
+// Load microop initiateAcc(): computes EA and calls initiateMemRead.
+def template CmLoadMicroInitiateAcc {{
+    Fault
+    %(class_name)s::initiateAcc(ExecContext *xc,
+        trace::InstRecord *traceData) const
+    {
+        Addr EA;  // assigned by the ea_code snippet
+
+        %(op_src_decl)s;
+        %(op_rd)s;
+        %(ea_code)s;
+
+        return initiateMemRead(xc, traceData, EA, Mem, memAccessFlags);
+    }
+}};
+// Load microop completeAcc(): reads the packet via getMemLE, writes back.
+def template CmLoadMicroCompleteAcc {{
+    Fault
+    %(class_name)s::completeAcc(PacketPtr pkt, ExecContext *xc,
+        trace::InstRecord *traceData) const
+    {
+        %(op_decl)s;
+        %(op_rd)s;
+
+        getMemLE(pkt, Mem, traceData);
+
+        %(memacc_code)s;
+        %(op_wb)s;
+
+        return NoFault;
+    }
+}};
+// Return microop declaration: execute, disassembly and branchTarget.
+def template CmRetMicroDeclare {{
+    class %(class_name)s : public %(base_class)s
+    {
+      public:
+        /// Constructor.
+        %(class_name)s(ExtMachInst machInst);
+
+      protected:
+        using %(base_class)s::%(base_class)s;
+
+        Fault execute(ExecContext *, trace::InstRecord *) const override;
+
+        std::string
+        generateDisassembly(
+                Addr pc, const loader::SymbolTable *symtab) const override;
+
+        std::unique_ptr<PCStateBase> branchTarget(
+                ThreadContext *tc) const override;
+
+        using StaticInst::branchTarget;  // keep the base overloads visible
+
+      private:
+        %(reg_idx_arr_decl)s;
+    };
+}};
+// Return microop constructor: base-class init plus generated setup only.
+def template CmRetMicroConstructor {{
+    %(class_name)s::%(class_name)s(ExtMachInst machInst)
+        : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s)
+    {
+        %(set_reg_idx_arr)s;
+        %(constructor)s;
+    }
+}};
+// Return microop: execute, branchTarget (src reg 0, bit 0 cleared), disasm.
+def template CmRetMicroExecute {{
+    Fault
+    %(class_name)s::execute(
+        ExecContext *xc, trace::InstRecord *traceData) const
+    {
+        %(op_decl)s;
+        %(op_rd)s;
+        %(code)s;
+        %(op_wb)s;
+        return NoFault;
+    }
+
+    std::unique_ptr<PCStateBase>
+    %(class_name)s::branchTarget(ThreadContext *tc) const
+    {
+        PCStateBase *pc_ptr = tc->pcState().clone();  // owned by the result
+        pc_ptr->as<PCState>().set(rvSext(tc->getReg(srcRegIdx(0)) & ~0x1));
+        return std::unique_ptr<PCStateBase>{pc_ptr};
+    }
+
+    std::string
+    %(class_name)s::generateDisassembly(
+        Addr pc, const loader::SymbolTable *symtab) const
+    {
+        std::stringstream ss;
+        ss << mnemonic << ' ' << registerName(srcRegIdx(0));
+        return ss.str();
+    }
+}};
+
+// CmMv macro-op class declaration: constructor and disassembly override.
+def template CmMvDeclare {{
+    class %(class_name)s : public %(base_class)s
+    {
+      public:
+        %(class_name)s(ExtMachInst machInst);
+
+      protected:
+        using %(base_class)s::%(base_class)s;
+
+        std::string generateDisassembly(
+            Addr pc, const loader::SymbolTable *symtab) const override;
+    };
+}};
+// cm.mvsa01 constructor: two move microops, (A0, r1s) and (A1, r2s).
+def template CmMvsa01Constructor {{
+    %(class_name)s::%(class_name)s(ExtMachInst machInst)
+        : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s)
+    {   // NOTE(review): no r1s != r2s check here; confirm against Zcmp spec
+        microops.emplace_back(
+            new %(class_name)sMvMicroInst(
+                machInst, int_reg::A0, StackRegs[machInst.r1s]));
+        microops.emplace_back(
+            new %(class_name)sMvMicroInst(
+                machInst, int_reg::A1, StackRegs[machInst.r2s]));
+
+        microops.front()->setFirstMicroop();
+        microops.back()->setLastMicroop();
+    }
+}};
+// cm.mva01s constructor: two move microops, (r1s, A0) and (r2s, A1).
+def template CmMva01sConstructor {{
+    %(class_name)s::%(class_name)s(ExtMachInst machInst)
+        : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s)
+    {
+        microops.emplace_back(
+            new %(class_name)sMvMicroInst(
+                machInst, StackRegs[machInst.r1s], int_reg::A0));
+        microops.emplace_back(
+            new %(class_name)sMvMicroInst(
+                machInst, StackRegs[machInst.r2s], int_reg::A1));
+
+        microops.front()->setFirstMicroop();
+        microops.back()->setLastMicroop();
+    }
+}};
+// CmMv disassembly text: mnemonic followed by the r1s and r2s registers.
+def template CmMvExecute {{
+    std::string
+    %(class_name)s::generateDisassembly(
+        Addr pc, const loader::SymbolTable *symtab) const
+    {
+        std::stringstream ss;  // both names are looked up through StackRegs
+        ss << mnemonic << ' ' << registerName(StackRegs[machInst.r1s])
+            << ", " << registerName(StackRegs[machInst.r2s]);
+        return ss.str();
+    }
+}};
+// Move microop declaration; the ctor takes two register ids.
+def template CmMvMicroDeclare {{
+    class %(class_name)s : public %(base_class)s
+    {
+      public:
+        %(class_name)s(ExtMachInst machInst, RegId push_reg, RegId pop_reg);
+      protected:
+        using %(base_class)s::%(base_class)s;
+
+        Fault execute(ExecContext *, trace::InstRecord *) const override;
+        std::string generateDisassembly(
+            Addr, const loader::SymbolTable *) const override;
+
+      private:
+        %(reg_idx_arr_decl)s;
+    };
+}};
+// Move microop constructor: base-class init plus generated setup only.
+def template CmMvMicroConstructor {{
+    %(class_name)s::%(class_name)s(
+        ExtMachInst machInst, RegId push_reg, RegId pop_reg)
+        : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s)
+    {
+        %(set_reg_idx_arr)s;
+        %(constructor)s;  // NOTE(review): presumably uses push_reg/pop_reg
+    }
+}};
+// Move microop execute() and disassembly (mnemonic, dest, src).
+def template CmMvMicroExecute {{
+    Fault
+    %(class_name)s::execute(
+        ExecContext *xc, trace::InstRecord *traceData) const
+    {
+        %(op_decl)s;
+        %(op_rd)s;
+        %(code)s;
+        %(op_wb)s;
+        return NoFault;
+    }
+
+    std::string
+    %(class_name)s::generateDisassembly(
+        Addr pc, const loader::SymbolTable *symtab) const
+    {
+        std::stringstream ss;  // fields are separated by single spaces
+        ss << mnemonic << ' ' << registerName(destRegIdx(0)) << ' '
+            << registerName(srcRegIdx(0));
+        return ss.str();
+    }
+}};
+
+def format CmPush(*flags) {{
+    code = ''
+    macro_iop = InstObjParams(name, Name, 'CmMacroInst', code, flags)
+    header_output = CmPushDeclare.subst(macro_iop)
+    decoder_output = CmPushConstructor.subst(macro_iop)
+    exec_output = CmPushExecute.subst(macro_iop)
+    decode_block = BasicDecode.subst(macro_iop)
+
+    memacc_code = 'Mem_sw = CmPushReg_sw;'
+    ea_code = 'EA = rvSext(sp + offset);'
+    micro32_iop = InstObjParams('lw', f'{Name}32MicroInst', 'RiscvMicroInst',
+        {'ea_code': ea_code, 'memacc_code': memacc_code},
+        flags)
+
+    mem_flags = [getAlignFlag(micro32_iop)]
+    s = '\n\tmemAccessFlags = ' + '|'.join(mem_flags) + ';'
+    micro32_iop.constructor += s
+
+    header_output += CmStoreMicroDeclare.subst(micro32_iop)
+    decoder_output += CmStoreMicroConstructor.subst(micro32_iop)
+    exec_output += CmStoreMicroExecute.subst(micro32_iop) \
+        + CmStoreMicroInitiateAcc.subst(micro32_iop) \
+        + CmStoreMicroCompleteAcc.subst(micro32_iop)
+
+    memacc_code = 'Mem = CmPushReg;'
+    ea_code = 'EA = rvSext(sp + offset);'
+    micro64_iop = InstObjParams('ld', f'{Name}64MicroInst', 'RiscvMicroInst',
+        {'ea_code': ea_code, 'memacc_code': memacc_code},
+        flags)
+
+    mem_flags = [getAlignFlag(micro64_iop)]
+    s = '\n\tmemAccessFlags = ' + '|'.join(mem_flags) + ';'
+    micro64_iop.constructor += s
+
+    header_output += CmStoreMicroDeclare.subst(micro64_iop)
+    decoder_output += CmStoreMicroConstructor.subst(micro64_iop)
+    exec_output += CmStoreMicroExecute.subst(micro64_iop) \
+        + CmStoreMicroInitiateAcc.subst(micro64_iop) \
+        + CmStoreMicroCompleteAcc.subst(micro64_iop)
+
+    code = 'spd = rvSext(sp + adj);'
+    sp_adj_iop = InstObjParams('addi', f'{Name}SpAdjMicroInst',
+        'RiscvMicroInst', code, flags)
+
+    header_output += SpAdjMicroDeclare.subst(sp_adj_iop)
+    decoder_output += SpAdjMicroConstructor.subst(sp_adj_iop)
+    exec_output += SpAdjMicroExecute.subst(sp_adj_iop)
+}};
+
+def format CmPop(is_ret=False, has_a0=False, *flags) {{
+    code = ''  # empty 'code' snippet handed to the macro-op's InstObjParams
+    flags = []  # NOTE(review): overwrites the *flags argument; confirm intent
+    has_a0 = eval(has_a0)  # presumably the text 'True'/'False'; TODO confirm
+    is_ret = eval(is_ret)
+    move_a0_desc = ''
+    return_desc = ''
+    # C++ snippets spliced into the macro-op constructor via the iop below.
+    if has_a0:
+        move_a0_desc = rf'''
+          cur_inst = new {Name}MvMicroInst(
+              machInst, ReturnValueReg, int_reg::Zero);
+          microops.emplace_back(cur_inst);
+       '''
+
+    if is_ret:
+        return_desc = rf'''
+          cur_inst = new {Name}RetMicroInst(machInst);
+          microops.emplace_back(cur_inst);
+       '''
+    # Macro-op: declaration, constructor, execute and decode block.
+    macro_iop = InstObjParams(name, Name, 'CmMacroInst',
+        {'code': code, 'move_a0_desc': move_a0_desc,
+         'return_desc': return_desc},
+        flags)
+    header_output = CmPopDeclare.subst(macro_iop)
+    decoder_output = CmPopConstructor.subst(macro_iop)
+    exec_output = CmPopExecute.subst(macro_iop)
+    decode_block = BasicDecode.subst(macro_iop)
+    # 32-bit load micro-op ('lw'): fills the popped register from memory.
+    memacc_code = 'CmPopReg_sw = Mem_sw;'
+    ea_code = 'EA = rvSext(sp + offset);'
+    micro32_iop = InstObjParams('lw', f'{Name}32MicroInst', 'RiscvMicroInst',
+        {'ea_code': ea_code, 'memacc_code': memacc_code},
+        flags)
+    # Append the alignment flag assignment to the generated constructor body.
+    mem_flags = [getAlignFlag(micro32_iop)]
+    s = '\n\tmemAccessFlags = ' + '|'.join(mem_flags) + ';'
+    micro32_iop.constructor += s
+
+    header_output += CmLoadMicroDeclare.subst(micro32_iop)
+    decoder_output += CmLoadMicroConstructor.subst(micro32_iop)
+    exec_output += CmLoadMicroExecute.subst(micro32_iop) \
+        + CmLoadMicroInitiateAcc.subst(micro32_iop) \
+        + CmLoadMicroCompleteAcc.subst(micro32_iop)
+    # 64-bit load micro-op ('ld'): same steps as the 32-bit variant.
+    memacc_code = 'CmPopReg = Mem;'
+    ea_code = 'EA = rvSext(sp + offset);'
+    micro64_iop = InstObjParams('ld', f'{Name}64MicroInst', 'RiscvMicroInst',
+        {'ea_code': ea_code, 'memacc_code': memacc_code},
+        flags)
+
+    mem_flags = [getAlignFlag(micro64_iop)]
+    s = '\n\tmemAccessFlags = ' + '|'.join(mem_flags) + ';'
+    micro64_iop.constructor += s
+
+    header_output += CmLoadMicroDeclare.subst(micro64_iop)
+    decoder_output += CmLoadMicroConstructor.subst(micro64_iop)
+    exec_output += CmLoadMicroExecute.subst(micro64_iop) \
+        + CmLoadMicroInitiateAcc.subst(micro64_iop) \
+        + CmLoadMicroCompleteAcc.subst(micro64_iop)
+    # Stack-pointer adjust micro-op ('addi'): writes rvSext(sp + adj) to spd.
+    code = 'spd = rvSext(sp + adj);'
+    sp_adj_iop = InstObjParams('addi', f'{Name}SpAdjMicroInst',
+        'RiscvMicroInst', code, flags)
+
+    header_output += SpAdjMicroDeclare.subst(sp_adj_iop)
+    decoder_output += SpAdjMicroConstructor.subst(sp_adj_iop)
+    exec_output += SpAdjMicroExecute.subst(sp_adj_iop)
+    # Optional register-move micro-op ('mv'), generated only when has_a0.
+    if has_a0:
+        code = 'CmPopReg = CmPushReg;'
+        has_a0_iop = InstObjParams('mv', f'{Name}MvMicroInst',
+            'RiscvMicroInst', code, flags)
+
+        header_output += CmMvMicroDeclare.subst(has_a0_iop)
+        decoder_output += CmMvMicroConstructor.subst(has_a0_iop)
+        exec_output += CmMvMicroExecute.subst(has_a0_iop)
+    # Optional return micro-op ('jr'): next PC is ra with bit 0 cleared.
+    if is_ret:
+        code = 'NPC = rvSext(ra & (~0x1));'
+        ret_flags = ['IsIndirectControl', 'IsUncondControl', 'IsReturn']
+        is_ret_iop = InstObjParams('jr', f'{Name}RetMicroInst',
+            'RiscvMicroInst', code, ret_flags)
+
+        header_output += CmRetMicroDeclare.subst(is_ret_iop)
+        decoder_output += CmRetMicroConstructor.subst(is_ret_iop)
+        exec_output += CmRetMicroExecute.subst(is_ret_iop)
+}};
+
+def format CmMvsa01() {{
+    code = ''  # empty 'code' snippet handed to the macro-op's InstObjParams
+    flags = []
+    iop = InstObjParams(name, Name, 'RiscvMacroInst', code, flags)
+    header_output = CmMvDeclare.subst(iop)
+    decoder_output = CmMvsa01Constructor.subst(iop)
+    exec_output = CmMvExecute.subst(iop)
+    decode_block = BasicDecode.subst(iop)
+    # 'mv' micro-op class that the macro-op constructor instantiates.
+    code = 'CmPopReg = CmPushReg;'
+    micro_iop = InstObjParams('mv', f'{Name}MvMicroInst', 'RiscvMicroInst',
+        code, flags)
+
+    header_output += CmMvMicroDeclare.subst(micro_iop)
+    decoder_output += CmMvMicroConstructor.subst(micro_iop)
+    exec_output += CmMvMicroExecute.subst(micro_iop)
+}};
+
+def format CmMva01s() {{
+    code = ''  # empty 'code' snippet handed to the macro-op's InstObjParams
+    flags = []
+    iop = InstObjParams(name, Name, 'RiscvMacroInst', code, flags)
+    header_output = CmMvDeclare.subst(iop)
+    decoder_output = CmMva01sConstructor.subst(iop)
+    exec_output = CmMvExecute.subst(iop)
+    decode_block = BasicDecode.subst(iop)
+    # 'mv' micro-op class that the macro-op constructor instantiates.
+    code = 'CmPopReg = CmPushReg;'
+    micro_iop = InstObjParams('mv', f'{Name}MvMicroInst', 'RiscvMicroInst',
+        code, flags)
+
+    header_output += CmMvMicroDeclare.subst(micro_iop)
+    decoder_output += CmMvMicroConstructor.subst(micro_iop)
+    exec_output += CmMvMicroExecute.subst(micro_iop)
+}};
diff --git a/src/arch/riscv/isa/includes.isa b/src/arch/riscv/isa/includes.isa
index b37e62bca8..4d53958723 100644
--- a/src/arch/riscv/isa/includes.isa
+++ b/src/arch/riscv/isa/includes.isa
@@ -55,6 +55,7 @@ output header {{
 #include "arch/riscv/insts/static_inst.hh"
 #include "arch/riscv/insts/unknown.hh"
 #include "arch/riscv/insts/vector.hh"
+#include "arch/riscv/insts/zcmp.hh"
 #include "arch/riscv/interrupts.hh"
 #include "cpu/static_inst.hh"
 #include "mem/packet.hh"
diff --git a/src/arch/riscv/isa/operands.isa b/src/arch/riscv/isa/operands.isa
index de36d902b1..e2a7522b94 100644
--- a/src/arch/riscv/isa/operands.isa
+++ b/src/arch/riscv/isa/operands.isa
@@ -70,10 +70,14 @@ def operands {{
     'Rp2': IntReg('ud', 'RP2 + 8', 'IsInteger', 3),
     'ra': IntReg('ud', 'ReturnAddrReg', 'IsInteger', 1),
     'sp': IntReg('ud', 'StackPointerReg', 'IsInteger', 2),
+    'spd': IntReg('ud', 'StackPointerReg', 'IsInteger', 1),
 
     'a0': IntReg('ud', '10', 'IsInteger', 1),
     'a1': IntReg('ud', '11', 'IsInteger', 2),
 
+    'CmPushReg': IntReg('ud', 'push_reg', 'IsInteger', 3),
+    'CmPopReg': IntReg('ud', 'pop_reg', 'IsInteger', 1),
+
     'Fd': FloatRegOp('df', 'FD', 'IsFloating', 1),
     'Fd_bits': FloatRegOp('ud', 'FD', 'IsFloating', 1),
     'Fs1': FloatRegOp('df', 'FS1', 'IsFloating', 2),
diff --git a/src/arch/riscv/regs/int.hh b/src/arch/riscv/regs/int.hh
index 4ac01c60c1..dc7e37cdbe 100644
--- a/src/arch/riscv/regs/int.hh
+++ b/src/arch/riscv/regs/int.hh
@@ -149,6 +149,18 @@ inline constexpr RegId ArgumentRegs[] = {
     int_reg::A4, int_reg::A5, int_reg::A6, int_reg::A7
 };
 
+const std::vector<RegId> PushPopRegList = {
+    int_reg::S11, int_reg::S10, int_reg::S9, int_reg::S8,
+    int_reg::S7, int_reg::S6, int_reg::S5, int_reg::S4,
+    int_reg::S3, int_reg::S2, int_reg::S1, int_reg::S0,
+    int_reg::Ra
+};
+
+inline constexpr RegId StackRegs[] = {
+  int_reg::S0, int_reg::S1, int_reg::S2, int_reg::S3,
+  int_reg::S4, int_reg::S5, int_reg::S6, int_reg::S7,
+};
+
 } // namespace RiscvISA
 } // namespace gem5
 
diff --git a/src/arch/riscv/types.hh b/src/arch/riscv/types.hh
index 4bd3168804..8b72c782a9 100644
--- a/src/arch/riscv/types.hh
+++ b/src/arch/riscv/types.hh
@@ -127,6 +127,8 @@ BitUnion64(ExtMachInst)
     Bitfield< 6,  2>    rc2;
     Bitfield< 9,  7>    rp1;
     Bitfield< 4,  2>    rp2;
+    Bitfield< 9,  7>    r1s;
+    Bitfield< 4,  2>    r2s;
     Bitfield<11,  7>    fc1;
     Bitfield< 6,  2>    fc2;
     Bitfield< 4,  2>    fp2;
@@ -145,6 +147,8 @@ BitUnion64(ExtMachInst)
     Bitfield<12, 10>    cimm3;
     Bitfield< 6,  5>    cimm2;
     Bitfield<12>        cimm1;
+    Bitfield< 7,  4>    rlist;
+    Bitfield< 3,  2>    spimm;
     // Pseudo instructions
     Bitfield<31, 25>    m5func;
     // vector

From a6421e4404b85f3d9a50a5dc82b2552e08e61a7a Mon Sep 17 00:00:00 2001
From: Roger Chang <rogerycchang@google.com>
Date: Wed, 25 Sep 2024 13:59:35 +0800
Subject: [PATCH 38/47] arch-riscv: Add IsDelayedCommit to each zcmp micro
 instruction

---
 src/arch/riscv/isa/formats/zcmp.isa | 34 +++++++++++++++++++----------
 1 file changed, 22 insertions(+), 12 deletions(-)

diff --git a/src/arch/riscv/isa/formats/zcmp.isa b/src/arch/riscv/isa/formats/zcmp.isa
index d8adbc5532..263c880022 100644
--- a/src/arch/riscv/isa/formats/zcmp.isa
+++ b/src/arch/riscv/isa/formats/zcmp.isa
@@ -52,6 +52,7 @@ def template CmPushConstructor {{
         if (rlist < 4) {
             cur_inst = new Unknown(machInst);
             cur_inst->setFlag(IsMicroop);
+            cur_inst->setDelayedCommit();
             microops.emplace_back(cur_inst);
         } else {
             int start_reg = 0;
@@ -70,10 +71,12 @@ def template CmPushConstructor {{
                     cur_inst = new %(class_name)s64MicroInst(
                         machInst, PushPopRegList[i], offset);
                 }
+                cur_inst->setDelayedCommit();
                 microops.emplace_back(cur_inst);
             }
 
             cur_inst = new %(class_name)sSpAdjMicroInst(machInst, -stackAdj());
+            cur_inst->setDelayedCommit();
             microops.emplace_back(cur_inst);
         }
 
@@ -275,6 +278,7 @@ def template CmPopConstructor {{
         if (rlist < 4) {
             cur_inst = new Unknown(machInst);
             cur_inst->setFlag(IsMicroop);
+            cur_inst->setDelayedCommit();
             microops.emplace_back(cur_inst);
         } else {
             int start_reg = 0;
@@ -293,10 +297,12 @@ def template CmPopConstructor {{
                     cur_inst = new %(class_name)s64MicroInst(
                         machInst, PushPopRegList[i], offset);
                 }
+                cur_inst->setDelayedCommit();
                 microops.emplace_back(cur_inst);
             }
 
             cur_inst = new %(class_name)sSpAdjMicroInst(machInst, stackAdj());
+            cur_inst->setDelayedCommit();
             microops.emplace_back(cur_inst);
 
             %(move_a0_desc)s;
@@ -506,12 +512,13 @@ def template CmMvsa01Constructor {{
     %(class_name)s::%(class_name)s(ExtMachInst machInst)
         : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s)
     {
-        microops.emplace_back(
-            new %(class_name)sMvMicroInst(
-                machInst, int_reg::A0, StackRegs[machInst.r1s]));
-        microops.emplace_back(
-            new %(class_name)sMvMicroInst(
-                machInst, int_reg::A1, StackRegs[machInst.r2s]));
+        StaticInstPtr cur_inst;
+        cur_inst = new %(class_name)sMvMicroInst(
+            machInst, int_reg::A0, StackRegs[machInst.r1s]);
+        microops.emplace_back(cur_inst);
+        cur_inst = new %(class_name)sMvMicroInst(
+            machInst, int_reg::A1, StackRegs[machInst.r2s]);
+        microops.emplace_back(cur_inst);
 
         microops.front()->setFirstMicroop();
         microops.back()->setLastMicroop();
@@ -522,12 +529,15 @@ def template CmMva01sConstructor {{
     %(class_name)s::%(class_name)s(ExtMachInst machInst)
         : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s)
     {
-        microops.emplace_back(
-            new %(class_name)sMvMicroInst(
-                machInst, StackRegs[machInst.r1s], int_reg::A0));
-        microops.emplace_back(
-            new %(class_name)sMvMicroInst(
-                machInst, StackRegs[machInst.r2s], int_reg::A1));
+        StaticInstPtr cur_inst;
+        cur_inst = new %(class_name)sMvMicroInst(
+            machInst, StackRegs[machInst.r1s], int_reg::A0);
+        cur_inst->setDelayedCommit();
+        microops.emplace_back(cur_inst);
+        cur_inst = new %(class_name)sMvMicroInst(
+            machInst, StackRegs[machInst.r2s], int_reg::A1);
+        cur_inst->setDelayedCommit();
+        microops.emplace_back(cur_inst);
 
         microops.front()->setFirstMicroop();
         microops.back()->setLastMicroop();

From 3eabd02801e08592949310da8fef871ae5539242 Mon Sep 17 00:00:00 2001
From: Abhishek Shailendra Singh <a3.singh@samsung.com>
Date: Mon, 12 Aug 2024 12:29:32 -0500
Subject: [PATCH 39/47] mem-cache: This commit adds the SMS prefetcher

Change-Id: I68d3bb6cf07385177d0f776fb958f652cfc41489
---
 src/mem/cache/prefetch/Prefetcher.py |  16 +++
 src/mem/cache/prefetch/SConscript    |   6 +-
 src/mem/cache/prefetch/sms.cc        | 165 +++++++++++++++++++++++++++
 src/mem/cache/prefetch/sms.hh        |  82 +++++++++++++
 4 files changed, 267 insertions(+), 2 deletions(-)
 create mode 100644 src/mem/cache/prefetch/sms.cc
 create mode 100644 src/mem/cache/prefetch/sms.hh

diff --git a/src/mem/cache/prefetch/Prefetcher.py b/src/mem/cache/prefetch/Prefetcher.py
index 9864c922f6..85cc628d5b 100644
--- a/src/mem/cache/prefetch/Prefetcher.py
+++ b/src/mem/cache/prefetch/Prefetcher.py
@@ -599,6 +599,22 @@ class BOPPrefetcher(QueuedPrefetcher):
     on_inst = False
 
 
+class SmsPrefetcher(QueuedPrefetcher):
+    # Paper: https://web.eecs.umich.edu/~twenisch/papers/isca06.pdf
+    type = "SmsPrefetcher"
+    cxx_class = "gem5::prefetch::Sms"
+    cxx_header = "mem/cache/prefetch/sms.hh"
+    ft_size = Param.Unsigned(64, "Size of Filter and Active generation table")
+    pht_size = Param.Unsigned(16384, "Size of pattern history table")
+    region_size = Param.Unsigned(4096, "Spatial region size")
+
+    queue_squash = True
+    queue_filter = True
+    cache_snoop = True
+    prefetch_on_access = True
+    on_inst = False
+
+
 class SBOOEPrefetcher(QueuedPrefetcher):
     type = "SBOOEPrefetcher"
     cxx_class = "gem5::prefetch::SBOOE"
diff --git a/src/mem/cache/prefetch/SConscript b/src/mem/cache/prefetch/SConscript
index 8ce15e9688..fe048adacb 100644
--- a/src/mem/cache/prefetch/SConscript
+++ b/src/mem/cache/prefetch/SConscript
@@ -31,8 +31,9 @@ Import('*')
 SimObject('Prefetcher.py', sim_objects=[
     'BasePrefetcher', 'MultiPrefetcher', 'QueuedPrefetcher',
     'StridePrefetcherHashedSetAssociative', 'StridePrefetcher',
-    'TaggedPrefetcher', 'IndirectMemoryPrefetcher', 'SignaturePathPrefetcher',
-    'SignaturePathPrefetcherV2', 'AccessMapPatternMatching', 'AMPMPrefetcher',
+    'SmsPrefetcher', 'TaggedPrefetcher', 'IndirectMemoryPrefetcher', 
+    'SignaturePathPrefetcher', 'SignaturePathPrefetcherV2', 
+    'AccessMapPatternMatching', 'AMPMPrefetcher',
     'DeltaCorrelatingPredictionTables', 'DCPTPrefetcher',
     'IrregularStreamBufferPrefetcher', 'SlimAMPMPrefetcher',
     'BOPPrefetcher', 'SBOOEPrefetcher', 'STeMSPrefetcher', 'PIFPrefetcher'])
@@ -47,6 +48,7 @@ Source('indirect_memory.cc')
 Source('pif.cc')
 Source('queued.cc')
 Source('sbooe.cc')
+Source('sms.cc')
 Source('signature_path.cc')
 Source('signature_path_v2.cc')
 Source('slim_ampm.cc')
diff --git a/src/mem/cache/prefetch/sms.cc b/src/mem/cache/prefetch/sms.cc
new file mode 100644
index 0000000000..8173fa59bc
--- /dev/null
+++ b/src/mem/cache/prefetch/sms.cc
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file
+ * Implements the Spatial Memory Streaming (SMS) prefetcher.
+ */
+
+#include "mem/cache/prefetch/sms.hh"
+
+#include "debug/HWPrefetch.hh"
+#include "params/SmsPrefetcher.hh"
+
+namespace gem5
+{
+
+namespace prefetch
+{
+
+Sms::Sms(const SmsPrefetcherParams &p)
+    : Queued(p), Max_Contexts(p.ft_size), MAX_PHTSize(p.pht_size),
+      Region_Size(p.region_size)
+{
+        AGT.clear();
+        AGTPC.clear();
+        FT.clear();
+        PHT.clear();
+        fifoFT.clear();
+        lruAGT.clear();
+        lruPHT.clear();
+
+}
+void
+Sms::notifyEvict(const EvictionInfo &info)
+{
+    //Check if any active generation has ended
+    Addr regionBase = roundDown(info.addr, Region_Size);
+    std::pair <Addr,Addr> pc_offset = AGTPC[regionBase];
+    if (AGT.find(regionBase) != AGT.end()) {
+        //remove old recording
+        if (PHT.find(pc_offset) != PHT.end()) {
+            PHT[pc_offset].clear();
+        }
+        //Move from AGT to PHT
+        for (std::set<Addr>::iterator it = AGT[regionBase].begin();
+         it != AGT[regionBase].end(); it ++) {
+            PHT[pc_offset].insert(*it);
+        }
+        lruPHT.push_front(pc_offset);
+    }
+
+    while (PHT.size() > MAX_PHTSize) {
+        PHT.erase(lruPHT.back());
+        lruPHT.pop_back();
+    }
+
+    AGTPC.erase(regionBase);
+    AGT.erase(regionBase);
+
+
+}
+void
+Sms::calculatePrefetch(const PrefetchInfo &pfi,
+    std::vector<AddrPriority> &addresses,
+    const CacheAccessor &cache)
+{
+
+    if (!pfi.hasPC()) {
+        DPRINTF(HWPrefetch, "Ignoring request with no PC.\n");
+        return;
+    }
+
+    Addr blk_addr = blockAddress(pfi.getAddr());
+    Addr pc = pfi.getPC();
+    Addr regionBase = roundDown(blk_addr, Region_Size);
+    Addr offset = blk_addr - regionBase;
+
+    //Training
+    if (AGT.find(regionBase) != AGT.end()) {
+        assert (FT.find(regionBase) == FT.end());
+        // Record Pattern
+        AGT[regionBase].insert(offset);
+        //update LRU
+        for (std::deque <Addr>::iterator lit = lruAGT.begin();
+         lit != lruAGT.end(); lit ++) {
+            if ((*lit) == regionBase) {
+                lruAGT.erase(lit);
+                lruAGT.push_front(regionBase);
+                break;
+            }
+        }
+    }
+    else if (FT.find(regionBase) != FT.end()) {
+        //move entry from FT to AGT
+        AGT[regionBase].insert(FT[regionBase].second);
+        AGTPC[regionBase] = FT[regionBase];
+        lruAGT.push_front(regionBase);
+        //Record latest offset
+        AGT[regionBase].insert(offset);
+        //Recycle FT entry
+        FT.erase(regionBase);
+        //Make space for next entry
+        while (AGT.size() > Max_Contexts) {
+            AGT.erase(lruAGT.back());
+            AGTPC.erase(lruAGT.back());
+            lruAGT.pop_back();
+        }
+    }
+    else {
+        // Trigger Access
+        FT[regionBase] = std::make_pair (pc,offset);
+        fifoFT.push_front(regionBase);
+        while (FT.size() > Max_Contexts) {
+            FT.erase(fifoFT.back());
+            fifoFT.pop_back();
+        }
+    }
+
+    //Prediction
+    std::pair <Addr, Addr> pc_offset = std::make_pair(pc,offset);
+    if (PHT.find(pc_offset) != PHT.end()) {
+        for (std::set<Addr>::iterator it = PHT[pc_offset].begin();
+         it != PHT[pc_offset].end(); it ++) {
+            Addr prefAddr = blockAddress(regionBase + (*it));
+            addresses.push_back(AddrPriority(prefAddr,0));
+        }
+        for (std::deque < std::pair <Addr,Addr> >::iterator lit
+         = lruPHT.begin(); lit != lruPHT.end(); lit ++) {
+            if ((*lit) == pc_offset) {
+                    lruPHT.erase(lit);
+                    lruPHT.push_front(pc_offset);
+                    break;
+            }
+        }
+    }
+
+}
+
+} // namespace prefetch
+} // namespace gem5
diff --git a/src/mem/cache/prefetch/sms.hh b/src/mem/cache/prefetch/sms.hh
new file mode 100644
index 0000000000..4bda1694dd
--- /dev/null
+++ b/src/mem/cache/prefetch/sms.hh
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file
+ * Declares the Spatial Memory Streaming (SMS) prefetcher.
+ */
+
+#ifndef __MEM_CACHE_PREFETCH_SMS_HH__
+#define __MEM_CACHE_PREFETCH_SMS_HH__
+
+#include <deque>
+#include <map>
+#include <set>
+
+#include "mem/cache/prefetch/queued.hh"
+#include "mem/packet.hh"
+
+namespace gem5
+{
+
+struct SmsPrefetcherParams;
+
+namespace prefetch
+{
+
+/** SMS prefetcher: per-region offset patterns keyed by (PC, offset). */
+class Sms : public Queued
+{
+  private:
+    const int Max_Contexts; // capacity of FT and of AGT (param ft_size)
+    const uint64_t MAX_PHTSize; // capacity of PHT (param pht_size)
+    const Addr Region_Size; // region granularity (param region_size)
+
+    std::map< Addr, std::set<Addr> > AGT; // region base -> offsets seen
+    std::map< Addr, std::pair<Addr,Addr> > AGTPC; // region -> (pc, offset)
+    std::map< Addr, std::pair<Addr,Addr> > FT; // region -> (pc, offset)
+    std::map< std::pair <Addr,Addr> , std::set<Addr> > PHT; // -> offsets
+    std::deque<Addr> fifoFT; // FT regions, newest at front
+    std::deque<Addr> lruAGT; // AGT regions, most recent at front
+    std::deque< std::pair <Addr,Addr> > lruPHT; // PHT keys, recent first
+
+    using EvictionInfo = CacheDataUpdateProbeArg;
+    void notifyEvict(const EvictionInfo &info) override;
+
+  public:
+    Sms(const SmsPrefetcherParams &p);
+    ~Sms() = default;
+    void calculatePrefetch(const PrefetchInfo &pfi,
+                           std::vector<AddrPriority> &addresses,
+                           const CacheAccessor &cache) override;
+};
+
+} // namespace prefetch
+} // namespace gem5
+
+#endif // __MEM_CACHE_PREFETCH_SMS_HH__

From bd939821c800099663e73ba833075dc3251bb691 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 15 Aug 2024 18:33:00 +0000
Subject: [PATCH 40/47] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 src/mem/cache/prefetch/SConscript | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mem/cache/prefetch/SConscript b/src/mem/cache/prefetch/SConscript
index fe048adacb..c971b7541b 100644
--- a/src/mem/cache/prefetch/SConscript
+++ b/src/mem/cache/prefetch/SConscript
@@ -31,8 +31,8 @@ Import('*')
 SimObject('Prefetcher.py', sim_objects=[
     'BasePrefetcher', 'MultiPrefetcher', 'QueuedPrefetcher',
     'StridePrefetcherHashedSetAssociative', 'StridePrefetcher',
-    'SmsPrefetcher', 'TaggedPrefetcher', 'IndirectMemoryPrefetcher', 
-    'SignaturePathPrefetcher', 'SignaturePathPrefetcherV2', 
+    'SmsPrefetcher', 'TaggedPrefetcher', 'IndirectMemoryPrefetcher',
+    'SignaturePathPrefetcher', 'SignaturePathPrefetcherV2',
     'AccessMapPatternMatching', 'AMPMPrefetcher',
     'DeltaCorrelatingPredictionTables', 'DCPTPrefetcher',
     'IrregularStreamBufferPrefetcher', 'SlimAMPMPrefetcher',

From cf3427f87bc46aaa7b7f613498af0c5c4f33af17 Mon Sep 17 00:00:00 2001
From: Abhishek Shailendra Singh <a3.singh@samsung.com>
Date: Mon, 26 Aug 2024 13:14:06 -0500
Subject: [PATCH 41/47] mem-cache: refactored the code

---
 src/mem/cache/prefetch/sms.cc | 68 +++++++++++++++++------------------
 1 file changed, 32 insertions(+), 36 deletions(-)

diff --git a/src/mem/cache/prefetch/sms.cc b/src/mem/cache/prefetch/sms.cc
index 8173fa59bc..2ad4ef92e3 100644
--- a/src/mem/cache/prefetch/sms.cc
+++ b/src/mem/cache/prefetch/sms.cc
@@ -46,29 +46,29 @@ Sms::Sms(const SmsPrefetcherParams &p)
     : Queued(p), Max_Contexts(p.ft_size), MAX_PHTSize(p.pht_size),
       Region_Size(p.region_size)
 {
-        AGT.clear();
-        AGTPC.clear();
-        FT.clear();
-        PHT.clear();
-        fifoFT.clear();
-        lruAGT.clear();
-        lruPHT.clear();
+    AGT.clear();
+    AGTPC.clear();
+    FT.clear();
+    PHT.clear();
+    fifoFT.clear();
+    lruAGT.clear();
+    lruPHT.clear();
 
 }
 void
 Sms::notifyEvict(const EvictionInfo &info)
 {
     //Check if any active generation has ended
-    Addr regionBase = roundDown(info.addr, Region_Size);
-    std::pair <Addr,Addr> pc_offset = AGTPC[regionBase];
-    if (AGT.find(regionBase) != AGT.end()) {
+    Addr region_base = roundDown(info.addr, Region_Size);
+    std::pair <Addr,Addr> pc_offset = AGTPC[region_base];
+    if (AGT.find(region_base) != AGT.end()) {
         //remove old recording
         if (PHT.find(pc_offset) != PHT.end()) {
             PHT[pc_offset].clear();
         }
         //Move from AGT to PHT
-        for (std::set<Addr>::iterator it = AGT[regionBase].begin();
-         it != AGT[regionBase].end(); it ++) {
+        for (std::set<Addr>::iterator it = AGT[region_base].begin();
+         it != AGT[region_base].end(); it ++) {
             PHT[pc_offset].insert(*it);
         }
         lruPHT.push_front(pc_offset);
@@ -79,10 +79,8 @@ Sms::notifyEvict(const EvictionInfo &info)
         lruPHT.pop_back();
     }
 
-    AGTPC.erase(regionBase);
-    AGT.erase(regionBase);
-
-
+    AGTPC.erase(region_base);
+    AGT.erase(region_base);
 }
 void
 Sms::calculatePrefetch(const PrefetchInfo &pfi,
@@ -97,44 +95,42 @@ Sms::calculatePrefetch(const PrefetchInfo &pfi,
 
     Addr blk_addr = blockAddress(pfi.getAddr());
     Addr pc = pfi.getPC();
-    Addr regionBase = roundDown(blk_addr, Region_Size);
-    Addr offset = blk_addr - regionBase;
+    Addr region_base = roundDown(blk_addr, Region_Size);
+    Addr offset = blk_addr - region_base;
 
     //Training
-    if (AGT.find(regionBase) != AGT.end()) {
-        assert (FT.find(regionBase) == FT.end());
+    if (AGT.find(region_base) != AGT.end()) {
+        assert (FT.find(region_base) == FT.end());
         // Record Pattern
-        AGT[regionBase].insert(offset);
+        AGT[region_base].insert(offset);
         //update LRU
         for (std::deque <Addr>::iterator lit = lruAGT.begin();
          lit != lruAGT.end(); lit ++) {
-            if ((*lit) == regionBase) {
+            if ((*lit) == region_base) {
                 lruAGT.erase(lit);
-                lruAGT.push_front(regionBase);
+                lruAGT.push_front(region_base);
                 break;
             }
         }
-    }
-    else if (FT.find(regionBase) != FT.end()) {
+    } else if (FT.find(region_base) != FT.end()) {
         //move entry from FT to AGT
-        AGT[regionBase].insert(FT[regionBase].second);
-        AGTPC[regionBase] = FT[regionBase];
-        lruAGT.push_front(regionBase);
+        AGT[region_base].insert(FT[region_base].second);
+        AGTPC[region_base] = FT[region_base];
+        lruAGT.push_front(region_base);
         //Record latest offset
-        AGT[regionBase].insert(offset);
+        AGT[region_base].insert(offset);
         //Recycle FT entry
-        FT.erase(regionBase);
+        FT.erase(region_base);
         //Make space for next entry
         while (AGT.size() > Max_Contexts) {
             AGT.erase(lruAGT.back());
             AGTPC.erase(lruAGT.back());
             lruAGT.pop_back();
         }
-    }
-    else {
+    } else {
         // Trigger Access
-        FT[regionBase] = std::make_pair (pc,offset);
-        fifoFT.push_front(regionBase);
+        FT[region_base] = std::make_pair (pc,offset);
+        fifoFT.push_front(region_base);
         while (FT.size() > Max_Contexts) {
             FT.erase(fifoFT.back());
             fifoFT.pop_back();
@@ -146,8 +142,8 @@ Sms::calculatePrefetch(const PrefetchInfo &pfi,
     if (PHT.find(pc_offset) != PHT.end()) {
         for (std::set<Addr>::iterator it = PHT[pc_offset].begin();
          it != PHT[pc_offset].end(); it ++) {
-            Addr prefAddr = blockAddress(regionBase + (*it));
-            addresses.push_back(AddrPriority(prefAddr,0));
+            Addr pref_addr = blockAddress(region_base + (*it));
+            addresses.push_back(AddrPriority(pref_addr,0));
         }
         for (std::deque < std::pair <Addr,Addr> >::iterator lit
          = lruPHT.begin(); lit != lruPHT.end(); lit ++) {

From f55a4ce98960d4d624fb211a2b85567f0abe2bef Mon Sep 17 00:00:00 2001
From: Jason Lowe-Power <jason@lowepower.com>
Date: Thu, 17 Oct 2024 08:17:34 -0700
Subject: [PATCH 42/47] arch-x86,arch-arm: Remove static variables in decoders
 (#1643)

There were a number of variables in the arm and x86 decoders that are
static (e.g., the decode cache). It's a bit interesting that this
doesn't cause problems with multiple cores since each core has its own
decoder.

However, this causes segfaults if you run different cores on different
*host* threads. We are experimenting with running gem5 with multiple
host threads (i.e., in parallel), and removing these static variables
resolves the segfault.

This change also adds const to any other static variables to ensure that
they cannot be modified.

Signed-off-by: Jason Lowe-Power <jason@lowepower.com>
---
 src/arch/arm/decoder.cc |  2 --
 src/arch/arm/decoder.hh |  2 +-
 src/arch/x86/decoder.cc |  5 -----
 src/arch/x86/decoder.hh | 24 ++++++++++++------------
 4 files changed, 13 insertions(+), 20 deletions(-)

diff --git a/src/arch/arm/decoder.cc b/src/arch/arm/decoder.cc
index 9fc4be0e9a..3e898c5a47 100644
--- a/src/arch/arm/decoder.cc
+++ b/src/arch/arm/decoder.cc
@@ -53,8 +53,6 @@ namespace gem5
 namespace ArmISA
 {
 
-GenericISA::BasicDecodeCache<Decoder, ExtMachInst> Decoder::defaultCache;
-
 Decoder::Decoder(const ArmDecoderParams &params)
     : InstDecoder(params, &data),
       dvmEnabled(params.dvm_enabled),
diff --git a/src/arch/arm/decoder.hh b/src/arch/arm/decoder.hh
index 75488b6750..57c29546ae 100644
--- a/src/arch/arm/decoder.hh
+++ b/src/arch/arm/decoder.hh
@@ -94,7 +94,7 @@ class Decoder : public InstDecoder
     enums::DecoderFlavor decoderFlavor;
 
     /// A cache of decoded instruction objects.
-    static GenericISA::BasicDecodeCache<Decoder, ExtMachInst> defaultCache;
+    GenericISA::BasicDecodeCache<Decoder, ExtMachInst> defaultCache;
     friend class GenericISA::BasicDecodeCache<Decoder, ExtMachInst>;
 
     /**
diff --git a/src/arch/x86/decoder.cc b/src/arch/x86/decoder.cc
index af2456d6ab..ec595856a2 100644
--- a/src/arch/x86/decoder.cc
+++ b/src/arch/x86/decoder.cc
@@ -41,8 +41,6 @@ namespace gem5
 namespace X86ISA
 {
 
-X86ISAInst::MicrocodeRom Decoder::microcodeRom;
-
 Decoder::State
 Decoder::doResetState()
 {
@@ -671,9 +669,6 @@ Decoder::doImmediateState()
     return nextState;
 }
 
-Decoder::InstBytes Decoder::dummy;
-Decoder::InstCacheMap Decoder::instCacheMap;
-
 StaticInstPtr
 Decoder::decode(ExtMachInst mach_inst, Addr addr)
 {
diff --git a/src/arch/x86/decoder.hh b/src/arch/x86/decoder.hh
index e4b1de96d7..eee48c1f76 100644
--- a/src/arch/x86/decoder.hh
+++ b/src/arch/x86/decoder.hh
@@ -60,19 +60,19 @@ class Decoder : public InstDecoder
     // These are defined and documented in decoder_tables.cc
     static const uint8_t SizeTypeToSize[3][10];
     typedef const uint8_t ByteTable[256];
-    static ByteTable Prefixes[2];
+    static const ByteTable Prefixes[2];
 
-    static ByteTable UsesModRMOneByte;
-    static ByteTable UsesModRMTwoByte;
-    static ByteTable UsesModRMThreeByte0F38;
-    static ByteTable UsesModRMThreeByte0F3A;
+    static const ByteTable UsesModRMOneByte;
+    static const ByteTable UsesModRMTwoByte;
+    static const ByteTable UsesModRMThreeByte0F38;
+    static const ByteTable UsesModRMThreeByte0F3A;
 
-    static ByteTable ImmediateTypeOneByte;
-    static ByteTable ImmediateTypeTwoByte;
-    static ByteTable ImmediateTypeThreeByte0F38;
-    static ByteTable ImmediateTypeThreeByte0F3A;
+    static const ByteTable ImmediateTypeOneByte;
+    static const ByteTable ImmediateTypeTwoByte;
+    static const ByteTable ImmediateTypeThreeByte0F38;
+    static const ByteTable ImmediateTypeThreeByte0F3A;
 
-    static X86ISAInst::MicrocodeRom microcodeRom;
+    X86ISAInst::MicrocodeRom microcodeRom;
 
   protected:
     using MachInst = uint64_t;
@@ -88,7 +88,7 @@ class Decoder : public InstDecoder
         {}
     };
 
-    static InstBytes dummy;
+    InstBytes dummy;
 
     // The bytes to be predecoded.
     MachInst fetchChunk;
@@ -244,7 +244,7 @@ class Decoder : public InstDecoder
     decode_cache::InstMap<ExtMachInst> *instMap = nullptr;
     typedef std::unordered_map<
             CacheKey, decode_cache::InstMap<ExtMachInst> *> InstCacheMap;
-    static InstCacheMap instCacheMap;
+    InstCacheMap instCacheMap;
 
     StaticInstPtr decodeInst(ExtMachInst mach_inst);
 

From d454e421d231246a443231c7d94e0761feabf0ec Mon Sep 17 00:00:00 2001
From: "Bobby R. Bruce" <bbruce@ucdavis.edu>
Date: Thu, 17 Oct 2024 10:29:17 -0700
Subject: [PATCH 43/47] stdlib,arch-x86: Update X86Demoboard (#1618)

This commit modifies X86DemoBoard so that its configuration numbers are
more similar to those of RiscvDemoBoard and ArmDemoBoard. It also adds SE
mode support to X86DemoBoard. Note that the changes here depend on the
changes in PR 1579.

**Note**: This PR was created so @BobbyRBruce could add his commits to
#1600

---------

Co-authored-by: Erin Le <ejle@ucdavis.edu>
---
 .../gem5/prebuilt/demo/x86_demo_board.py      | 85 ++++++++++++++-----
 1 file changed, 65 insertions(+), 20 deletions(-)

diff --git a/src/python/gem5/prebuilt/demo/x86_demo_board.py b/src/python/gem5/prebuilt/demo/x86_demo_board.py
index 793b43a3d1..ac89847f2b 100644
--- a/src/python/gem5/prebuilt/demo/x86_demo_board.py
+++ b/src/python/gem5/prebuilt/demo/x86_demo_board.py
@@ -24,27 +24,33 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+from m5.objects import (
+    IOXBar,
+    Pc,
+    Port,
+    X86FsLinux,
+)
 from m5.util import warn
 
-from ...coherence_protocol import CoherenceProtocol
+from ...components.boards.se_binary_workload import SEBinaryWorkload
 from ...components.boards.x86_board import X86Board
-from ...components.cachehierarchies.ruby.mesi_two_level_cache_hierarchy import (
-    MESITwoLevelCacheHierarchy,
+from ...components.cachehierarchies.classic.private_l1_shared_l2_cache_hierarchy import (
+    PrivateL1SharedL2CacheHierarchy,
 )
-from ...components.memory.single_channel import SingleChannelDDR3_1600
+from ...components.memory.multi_channel import DualChannelDDR4_2400
 from ...components.processors.cpu_types import CPUTypes
 from ...components.processors.simple_processor import SimpleProcessor
 from ...isas import ISA
+from ...utils.override import overrides
 from ...utils.requires import requires
 
 
-class X86DemoBoard(X86Board):
+class X86DemoBoard(X86Board, SEBinaryWorkload):
     """
     This prebuilt X86 board is used for demonstration purposes. It simulates
-    an X86 3GHz quad-core system with a 2GiB DDR3_1600 memory system. A
-    MESI_Two_Level cache hierarchy is set with an l1 data and instruction
-    cache, each 32KiB with an associativity of 8, and a single bank l2 cache of
-    1MiB with an associativity of 16.
+    an X86 3GHz dual-core system with a 3GiB DDR4_2400 memory system. The
+    cache hierarchy consists of per-core private L1 instruction and data
+    caches (64KiB each) connected to a shared 8MiB L2 cache.
 
     **DISCLAIMER**: This board is solely for demonstration purposes. This board
     is not known to be representative of any real-world system or produce
@@ -68,7 +74,6 @@ class X86DemoBoard(X86Board):
     def __init__(self):
         requires(
             isa_required=ISA.X86,
-            coherence_protocol_required=CoherenceProtocol.MESI_TWO_LEVEL,
         )
 
         warn(
@@ -77,18 +82,15 @@ class X86DemoBoard(X86Board):
             "real-world system. Use with caution."
         )
 
-        memory = SingleChannelDDR3_1600(size="2GiB")
+        # The other demo boards have 4 GiB of memory, but X86Board can only
+        # support up to 3 GiB.
+        memory = DualChannelDDR4_2400(size="3GiB")
         processor = SimpleProcessor(
-            cpu_type=CPUTypes.TIMING, isa=ISA.X86, num_cores=4
+            cpu_type=CPUTypes.TIMING, isa=ISA.X86, num_cores=2
         )
-        cache_hierarchy = MESITwoLevelCacheHierarchy(
-            l1d_size="32KiB",
-            l1d_assoc=8,
-            l1i_size="32KiB",
-            l1i_assoc=8,
-            l2_size="1MiB",
-            l2_assoc=16,
-            num_l2_banks=1,
+
+        cache_hierarchy = PrivateL1SharedL2CacheHierarchy(
+            l1d_size="64KiB", l1i_size="64KiB", l2_size="8MiB"
         )
 
         super().__init__(
@@ -97,3 +99,46 @@ class X86DemoBoard(X86Board):
             memory=memory,
             cache_hierarchy=cache_hierarchy,
         )
+
+    @overrides(X86Board)
+    def _setup_board(self) -> None:
+        if self._is_fs:
+            self.pc = Pc()
+
+            self.workload = X86FsLinux()
+
+            # North Bridge
+            self.iobus = IOXBar()
+
+            # Set up all of the I/O.
+            self._setup_io_devices()
+
+            self.m5ops_base = 0xFFFF0000
+
+    @overrides(X86Board)
+    def has_io_bus(self) -> bool:
+        return self.is_fullsystem()
+
+    @overrides(X86Board)
+    def get_io_bus(self) -> IOXBar:
+        if self.has_io_bus():
+            return self.iobus
+        else:
+            raise NotImplementedError(
+                "X86DemoBoard does not have an IO bus. "
+                "Use `has_io_bus()` to check this."
+            )
+
+    @overrides(X86Board)
+    def has_coherent_io(self) -> bool:
+        return self.is_fullsystem()
+
+    @overrides(X86Board)
+    def get_mem_side_coherent_io_port(self) -> Port:
+        if self.has_coherent_io():
+            return self.iobus.mem_side_ports
+        else:
+            raise NotImplementedError(
+                "x86DemoBoard does not have any I/O ports. Use has_coherent_io"
+                " to check this."
+            )

From 7591f2a84378c4810f58459b4e39d74457045405 Mon Sep 17 00:00:00 2001
From: Harshil Patel <hpppatel@ucdavis.edu>
Date: Thu, 17 Oct 2024 11:19:46 -0700
Subject: [PATCH 44/47] tests: Fix compiler tests (#1678)

- This change updates the syntax of constructors of template classes from
`class<T>()` to `class()`

- Initializes coherence to 0 in `src/mem/cache/cache_blk.hh`

The above changes are made to fix the errors seen when compiling gem5
with GCC 14
---
 src/arch/arm/faults.hh     | 2 +-
 src/base/stats/units.hh    | 4 ++--
 src/dev/virtio/base.hh     | 4 ++--
 src/mem/cache/cache_blk.hh | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/arch/arm/faults.hh b/src/arch/arm/faults.hh
index a76439574a..bcd067c284 100644
--- a/src/arch/arm/faults.hh
+++ b/src/arch/arm/faults.hh
@@ -264,7 +264,7 @@ class ArmFaultVals : public ArmFault
     static FaultVals vals;
 
   public:
-    ArmFaultVals<T>(ExtMachInst mach_inst = 0, uint32_t _iss = 0) :
+    ArmFaultVals(ExtMachInst mach_inst = 0, uint32_t _iss = 0) :
         ArmFault(mach_inst, _iss) {}
     FaultName name() const override { return vals.name; }
     FaultOffset offset(ThreadContext *tc) override;
diff --git a/src/base/stats/units.hh b/src/base/stats/units.hh
index 1d7d640ddb..eb4bcd240f 100644
--- a/src/base/stats/units.hh
+++ b/src/base/stats/units.hh
@@ -350,9 +350,9 @@ class Rate : public Base
         "otherwise, it would be a Ratio");
 
   private:
-    Rate<T1,T2>() {}
+    Rate() {}
   public:
-    Rate<T1,T2>(Rate<T1,T2> const&) = delete;
+    Rate(Rate const&) = delete;
     void operator=(Rate<T1,T2> const&) = delete;
     static Rate<T1,T2>*
     get()
diff --git a/src/dev/virtio/base.hh b/src/dev/virtio/base.hh
index 41ebb741d1..c31cd298b9 100644
--- a/src/dev/virtio/base.hh
+++ b/src/dev/virtio/base.hh
@@ -477,7 +477,7 @@ class VirtQueue : public Serializable
             Index index;
         };
 
-        VirtRing<T>(PortProxy &proxy, ByteOrder bo, uint16_t size) :
+        VirtRing(PortProxy &proxy, ByteOrder bo, uint16_t size) :
             header{0, 0}, ring(size), _proxy(proxy), _base(0), byteOrder(bo)
         {}
 
@@ -550,7 +550,7 @@ class VirtQueue : public Serializable
 
       private:
         // Remove default constructor
-        VirtRing<T>();
+        VirtRing();
 
         /** Guest physical memory proxy */
         PortProxy &_proxy;
diff --git a/src/mem/cache/cache_blk.hh b/src/mem/cache/cache_blk.hh
index 2b24828259..a2027f25f1 100644
--- a/src/mem/cache/cache_blk.hh
+++ b/src/mem/cache/cache_blk.hh
@@ -461,7 +461,7 @@ class CacheBlk : public TaggedEntry
 
   protected:
     /** The current coherence status of this block. @sa CoherenceBits */
-    unsigned coherence;
+    unsigned coherence = 0;
 
     // The following setters have been marked as protected because their
     // respective variables should only be modified at 2 moments:

From 946bf83b75205f11c3c6cdaba274caa4a9e16046 Mon Sep 17 00:00:00 2001
From: Harshil Patel <hpppatel@ucdavis.edu>
Date: Fri, 18 Oct 2024 05:36:31 -0700
Subject: [PATCH 45/47] arch-arm: Add arm demo board (#1478)

This demo board is a preset Arm board that can be used to run example
gem5 simulations. This board doesn't simulate any known hardware.

The board will be used to run benchmarks such as gapbs and npb to
collect stats. The plan is to show these stats on the gem5 resources
website to provide more details about the resources.
---
 .../gem5_library/arm-demo-ubuntu-run.py       |  92 ++++++++++++++
 src/python/SConscript                         |   1 +
 .../gem5/prebuilt/demo/arm_demo_board.py      | 112 ++++++++++++++++++
 3 files changed, 205 insertions(+)
 create mode 100644 configs/example/gem5_library/arm-demo-ubuntu-run.py
 create mode 100644 src/python/gem5/prebuilt/demo/arm_demo_board.py

diff --git a/configs/example/gem5_library/arm-demo-ubuntu-run.py b/configs/example/gem5_library/arm-demo-ubuntu-run.py
new file mode 100644
index 0000000000..9b39c34330
--- /dev/null
+++ b/configs/example/gem5_library/arm-demo-ubuntu-run.py
@@ -0,0 +1,92 @@
+# Copyright (c) 2024 The Regents of the University of California
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met: redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer;
+# redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution;
+# neither the name of the copyright holders nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+"""
+This script shows an example of booting an ARM-based full-system Ubuntu
+disk image. This simulation boots the disk image using the ArmDemoBoard.
+
+Usage
+-----
+
+```bash
+scons build/ARM/gem5.opt -j $(nproc)
+./build/ARM/gem5.opt configs/example/gem5_library/arm-demo-ubuntu-run.py
+```
+"""
+import argparse
+
+from gem5.isas import ISA
+from gem5.prebuilt.demo.arm_demo_board import ArmDemoBoard
+from gem5.resources.resource import obtain_resource
+from gem5.simulate.exit_event import ExitEvent
+from gem5.simulate.simulator import Simulator
+from gem5.utils.requires import requires
+
+# This runs a check to ensure the gem5 binary interpreting this file is compiled to include the ARM ISA.
+requires(isa_required=ISA.ARM)
+
+parser = argparse.ArgumentParser(
+    description="An example configuration script to run the ArmDemoBoard."
+)
+
+parser.add_argument(
+    "--use-kvm",
+    action="store_true",
+    help="Use KVM cores instead of Timing.",
+)
+args = parser.parse_args()
+
+board = ArmDemoBoard(use_kvm=args.use_kvm)
+
+board.set_workload(
+    obtain_resource(
+        "arm-ubuntu-24.04-boot-with-systemd", resource_version="2.0.0"
+    )
+)
+
+
+def exit_event_handler():
+    print("First exit: kernel booted")
+    yield False  # gem5 is now executing systemd startup
+    print("Second exit: Started `after_boot.sh` script")
+    # The after_boot.sh script is executed after the kernel and systemd have
+    # booted.
+    yield False  # gem5 is now executing the `after_boot.sh` script
+    print("Third exit: Finished `after_boot.sh` script")
+    # The after_boot.sh script will run a script if it is passed via
+    # m5 readfile. This is the last exit event before the simulation exits.
+    yield True
+
+
+# We define the simulator using the board configured above.
+simulator = Simulator(
+    board=board,
+    on_exit_event={
+        ExitEvent.EXIT: exit_event_handler(),
+    },
+)
+
+simulator.run()
diff --git a/src/python/SConscript b/src/python/SConscript
index 3aed9f03e3..afe786536c 100644
--- a/src/python/SConscript
+++ b/src/python/SConscript
@@ -280,6 +280,7 @@ PySource('gem5.components.processors',
 PySource('gem5.prebuilt', 'gem5/prebuilt/__init__.py')
 PySource('gem5.prebuilt.demo', 'gem5/prebuilt/demo/__init__.py')
 PySource('gem5.prebuilt.demo', 'gem5/prebuilt/demo/x86_demo_board.py')
+PySource('gem5.prebuilt.demo', 'gem5/prebuilt/demo/arm_demo_board.py')
 PySource('gem5.prebuilt.riscvmatched',
     'gem5/prebuilt/riscvmatched/__init__.py')
 PySource('gem5.prebuilt.riscvmatched',
diff --git a/src/python/gem5/prebuilt/demo/arm_demo_board.py b/src/python/gem5/prebuilt/demo/arm_demo_board.py
new file mode 100644
index 0000000000..dfbc6d89e2
--- /dev/null
+++ b/src/python/gem5/prebuilt/demo/arm_demo_board.py
@@ -0,0 +1,112 @@
+# Copyright (c) 2024 The Regents of the University of California
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met: redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer;
+# redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution;
+# neither the name of the copyright holders nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from m5.objects import (
+    ArmDefaultRelease,
+    VExpress_GEM5_Foundation,
+    VExpress_GEM5_V1,
+)
+from m5.util import warn
+
+from ...components.boards.arm_board import ArmBoard
+from ...components.cachehierarchies.classic.private_l1_shared_l2_cache_hierarchy import (
+    PrivateL1SharedL2CacheHierarchy,
+)
+from ...components.memory import DualChannelDDR4_2400
+from ...components.processors.cpu_types import CPUTypes
+from ...components.processors.simple_processor import SimpleProcessor
+from ...isas import ISA
+from ...utils.requires import requires
+
+
+class ArmDemoBoard(ArmBoard):
+    """
+    This prebuilt ARM board is used for demonstration purposes. It simulates an
+    ARM 3GHz dual-core system with a 4GiB DDR4_2400 memory system. It uses
+    a PrivateL1SharedL2CacheHierarchy with l1d and l1i caches set to 64KiB and
+    l2 shared cache set to 8MiB
+
+    **DISCLAIMER**: This board is solely for demonstration purposes. This board
+    is not known to be representative of any real-world system or produce
+    reliable statistical results.
+    """
+
+    def __init__(self, use_kvm: bool = False) -> None:
+        """
+        :param use_kvm: If True, the board will use a SimpleProcessor
+            with cpu type of CPUTypes.KVM. If False, the board will use a SimpleProcessor with
+            a cpu type of CPUTypes.TIMING.
+        """
+        requires(
+            isa_required=ISA.ARM,
+        )
+
+        warn(
+            "The ARMDemoBoard is solely for demonstration purposes. "
+            "This board is not known to be be representative of any "
+            "real-world system. Use with caution."
+        )
+        cache_hierarchy = PrivateL1SharedL2CacheHierarchy(
+            l1d_size="64KiB", l1i_size="64KiB", l2_size="8MiB"
+        )
+
+        # Note: Normally a system with these specifications would have 1
+        # GiB of memory, but because some benchmarks would not run with
+        # 1 GiB of memory, we have set it to 4 GiB.
+        memory = DualChannelDDR4_2400(size="4GiB")
+
+        if use_kvm:
+            processor = SimpleProcessor(
+                cpu_type=CPUTypes.KVM, num_cores=2, isa=ISA.ARM
+            )
+            # The ArmBoard requires a `release` to be specified. This adds all the
+            # extensions or features to the system. We are setting this to for_kvm()
+            # to enable KVM simulation.
+            release = ArmDefaultRelease.for_kvm()
+
+            # The platform sets up the memory ranges of all the on-chip and off-chip
+            # devices present on the ARM system. ARM KVM only works with VExpress_GEM5_V1
+            # on the ArmBoard at the moment.
+            platform = VExpress_GEM5_V1()
+
+        else:
+            processor = SimpleProcessor(
+                cpu_type=CPUTypes.TIMING, num_cores=2, isa=ISA.ARM
+            )
+            release = ArmDefaultRelease()
+
+            # The platform sets up the memory ranges of all the on-chip and off-chip
+            # devices present on the ARM system.
+            platform = VExpress_GEM5_Foundation()
+
+        super().__init__(
+            clk_freq="3GHz",
+            processor=processor,
+            memory=memory,
+            cache_hierarchy=cache_hierarchy,
+            release=release,
+            platform=platform,
+        )

From ae0cee66ed3c09d25b8e73e349d9b0800b3c30da Mon Sep 17 00:00:00 2001
From: Pranith <bobby.prani@gmail.com>
Date: Fri, 18 Oct 2024 05:40:10 -0700
Subject: [PATCH 46/47] systemc: Disable 'overloaded-virtual' warn for clang
 (#1662)

We need to extend the warning suppression to the clang compiler as well.

Fixes #1658
---
 src/systemc/ext/core/sc_export.hh                      | 2 +-
 src/systemc/ext/core/sc_port.hh                        | 2 +-
 src/systemc/ext/tlm_core/2/sockets/initiator_socket.hh | 2 +-
 src/systemc/ext/tlm_core/2/sockets/target_socket.hh    | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/systemc/ext/core/sc_export.hh b/src/systemc/ext/core/sc_export.hh
index f231968e77..913cd75a9d 100644
--- a/src/systemc/ext/core/sc_export.hh
+++ b/src/systemc/ext/core/sc_export.hh
@@ -78,7 +78,7 @@ class sc_export : public sc_export_base
  * code is correct).
  * Please check section 9.3 of SystemC 2.3.1 release note for more details.
  */
-#if defined(__GNUC__) && (__GNUC__ >= 13)
+#if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ >= 13))
 #pragma GCC diagnostic ignored "-Woverloaded-virtual"
 #endif
     void operator () (IF &i) { bind(i); }
diff --git a/src/systemc/ext/core/sc_port.hh b/src/systemc/ext/core/sc_port.hh
index bf00cb9361..346eb430b1 100644
--- a/src/systemc/ext/core/sc_port.hh
+++ b/src/systemc/ext/core/sc_port.hh
@@ -126,7 +126,7 @@ class sc_port_b : public sc_port_base
  * code is correct).
  * Please check section 9.3 of SystemC 2.3.1 release note for more details.
  */
-#if defined(__GNUC__) && (__GNUC__ >= 13)
+#if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ >= 13))
 #pragma GCC diagnostic ignored "-Woverloaded-virtual"
 #endif
     void operator () (IF &i) { bind(i); }
diff --git a/src/systemc/ext/tlm_core/2/sockets/initiator_socket.hh b/src/systemc/ext/tlm_core/2/sockets/initiator_socket.hh
index 2bb97f7945..d4cf3849e3 100644
--- a/src/systemc/ext/tlm_core/2/sockets/initiator_socket.hh
+++ b/src/systemc/ext/tlm_core/2/sockets/initiator_socket.hh
@@ -105,7 +105,7 @@ class tlm_base_initiator_socket :
  * code is correct).
  * Please check section 9.3 of SystemC 2.3.1 release note for more details.
  */
-#if defined(__GNUC__) && (__GNUC__ >= 13)
+#if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ >= 13))
 #pragma GCC diagnostic ignored "-Woverloaded-virtual"
 #endif
     virtual void
diff --git a/src/systemc/ext/tlm_core/2/sockets/target_socket.hh b/src/systemc/ext/tlm_core/2/sockets/target_socket.hh
index 3f5cb98ae4..a3d3026614 100644
--- a/src/systemc/ext/tlm_core/2/sockets/target_socket.hh
+++ b/src/systemc/ext/tlm_core/2/sockets/target_socket.hh
@@ -100,7 +100,7 @@ class tlm_base_target_socket :
  * code is correct).
  * Please check section 9.3 of SystemC 2.3.1 release note for more details.
  */
-#if defined(__GNUC__) && (__GNUC__ >= 13)
+#if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ >= 13))
 #pragma GCC diagnostic ignored "-Woverloaded-virtual"
 #endif
     virtual void

From 3fc6cc7763e67ebab76358845b06b94060dd19be Mon Sep 17 00:00:00 2001
From: handsomeliu-google <handsomeliu@google.com>
Date: Fri, 18 Oct 2024 20:41:05 +0800
Subject: [PATCH 47/47] sim: Make SignalSinkPort::set virtual (#1679)

We are implementing derived classes of SignalSinkPort that do some
additional logic after the port is triggered (set() invoked by the
SignalSourcePort peer), and before executing the callback that a device
provides (in _onChange). Examples of such logic are additional logging or
debugging features. However, set() itself directly calls the _onChange
callback.

Making set() virtual provides the flexibility to achieve this
feature.
---
 src/sim/signal.hh | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/sim/signal.hh b/src/sim/signal.hh
index 233de07658..e89fbe0b9f 100644
--- a/src/sim/signal.hh
+++ b/src/sim/signal.hh
@@ -51,12 +51,11 @@ class SignalSinkPort : public Port
     SignalSourcePort<State> *_source = nullptr;
 
     State _state = {};
-    OnChangeFunc _onChange;
 
   protected:
     // if bypass_on_change is specified true, it will not call the _onChange
     // function. Only _state will be updated if needed.
-    void
+    virtual void
     set(const State &new_state, const bool bypass_on_change = false)
     {
         if (new_state == _state)
@@ -67,6 +66,8 @@ class SignalSinkPort : public Port
             _onChange(_state);
     }
 
+    OnChangeFunc _onChange;
+
   public:
     SignalSinkPort(const std::string &_name, PortID _id=InvalidPortID) :
         Port(_name, _id)