diff --git a/.github/workflows/ci-tests.yaml b/.github/workflows/ci-tests.yaml
index 8e828675af..f40f3faff9 100644
--- a/.github/workflows/ci-tests.yaml
+++ b/.github/workflows/ci-tests.yaml
@@ -5,7 +5,7 @@ name: CI Tests
 
 on:
   pull_request:
-    types: [opened, edited, synchronize, ready_for_review]
+    types: [opened, synchronize, ready_for_review]
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.ref || github.run_id }}
@@ -21,17 +21,48 @@ jobs:
     - uses: actions/setup-python@v5
     - uses: pre-commit/action@v3.0.1
 
+  get-date:
+    # We use the date to label caches. A cache is a "hit" if the requested
+    # binary and date are the same as what is stored in the cache.
+    # This essentially means the first job to run on a given day for a given
+    # binary will always be a "miss" and will have to build the binary then
+    # upload it as that day's binary. While this isn't the most
+    # efficient way to do this, the alternative was to take a hash of the
+    # `src` directory contents and use it as a key. We found there to be bugs
+    # with the hash function where this task would timeout. This approach is
+    # simple, works, and still provides some level of caching.
+    runs-on: ubuntu-latest
+    outputs:
+      date: ${{ steps.date.outputs.date }}
+    steps:
+    - name: Get the current date
+      id: date
+      run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT
+
   unittests-all-opt:
     runs-on: [self-hosted, linux, x64]
     if: github.event.pull_request.draft == false
     container: ghcr.io/gem5/ubuntu-24.04_all-dependencies:latest
-    needs: [pre-commit] # only runs if pre-commit passes.
+    needs: [pre-commit, get-date] # only runs if pre-commit passes.
     timeout-minutes: 60
     steps:
     - uses: actions/checkout@v4
+
+
+    # Restore the cache if available. As this just builds the unittests
+    # we only obtain the cache and do not provide it if it is not
+    # available.
+    - name: Cache build/ALL
+      uses: actions/cache/restore@v4
+      with:
+        path: build/ALL
+        key: testlib-build-all-${{ needs.get-date.outputs.date }}
+        restore-keys: |
+          testlib-build-all
+
     - name: CI Unittests
       working-directory: ${{ github.workspace }}
-      run: scons build/ALL/unittests.opt -j $(nproc)
+      run: scons --no-compress-debug build/ALL/unittests.opt -j $(nproc)
     - run: echo "This job's status is ${{ job.status }}."
 
   testlib-quick-matrix:
@@ -83,14 +114,24 @@ jobs:
     runs-on: [self-hosted, linux, x64]
     if: github.event.pull_request.draft == false
     container: ghcr.io/gem5/ubuntu-24.04_all-dependencies:latest
-    needs: [pre-commit, testlib-quick-matrix]
+    needs: [pre-commit, testlib-quick-matrix, get-date]
     strategy:
       matrix:
        build-target: ${{ fromJson(needs.testlib-quick-matrix.outputs.build-matrix) }}
     steps:
     - uses: actions/checkout@v4
+
+    - name: Cache build/ALL
+      uses: actions/cache@v4
+      if: ${{ endsWith(matrix.build-target, 'build/ALL/gem5.opt') }}
+      with:
+        path: build/ALL
+        key: testlib-build-all-${{ needs.get-date.outputs.date }}
+        restore-keys: |
+          testlib-build-all
+
     - name: Build gem5
-      run: scons ${{ matrix.build-target }} -j $(nproc)
+      run: scons --no-compress-debug ${{ matrix.build-target }} -j $(nproc)
 
     # Upload the gem5 binary as an artifact.
     # Note: the "achor.txt" file is a hack to make sure the paths are
@@ -199,13 +240,23 @@ jobs:
     runs-on: [self-hosted, linux, x64]
     container: ghcr.io/gem5/gcn-gpu:latest
     timeout-minutes: 180
-    needs: [pre-commit]
+    needs: [pre-commit, get-date]
     steps:
     - uses: actions/checkout@v4
 
+    # Obtain the cache if available. If not available this will upload
+    # this job's instance of the cache.
+    - name: Cache build/VEGA_X86
+      uses: actions/cache@v4
+      with:
+        path: build/VEGA_X86
+        key: testlib-build-vega-${{ needs.get-date.outputs.date }}
+        restore-keys: |
+          testlib-build-vega
+
     # Build the VEGA_X86/gem5.opt binary.
     - name: Build VEGA_X86/gem5.opt
-      run: scons build/VEGA_X86/gem5.opt -j`nproc`
+      run: scons --no-compress-debug build/VEGA_X86/gem5.opt -j`nproc`
 
     # Run the GPU tests.
     - name: Run Testlib GPU Tests
diff --git a/.github/workflows/compiler-tests.yaml b/.github/workflows/compiler-tests.yaml
index eb570916bc..c44d2d9161 100644
--- a/.github/workflows/compiler-tests.yaml
+++ b/.github/workflows/compiler-tests.yaml
@@ -13,8 +13,8 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        image: [gcc-version-13, gcc-version-12, gcc-version-11, gcc-version-10, clang-version-18, clang-version-17, clang-version-16, clang-version-15,
-          clang-version-14, ubuntu-22.04_all-dependencies, ubuntu-24.04_all-dependencies, ubuntu-24.04_min-dependencies]
+        image: [gcc-version-14, gcc-version-13, gcc-version-12, gcc-version-11, gcc-version-10, clang-version-18, clang-version-17, clang-version-16,
+          clang-version-15, clang-version-14, ubuntu-22.04_all-dependencies, ubuntu-24.04_all-dependencies, ubuntu-24.04_min-dependencies]
         opts: [.opt, .fast]
     runs-on: [self-hosted, linux, x64]
     timeout-minutes: 2880 # 48 hours
@@ -32,7 +32,7 @@ jobs:
       matrix:
         gem5-compilation: [ARM, ARM_MESI_Three_Level, ARM_MESI_Three_Level_HTM, ARM_MOESI_hammer, Garnet_standalone, MIPS, 'NULL', NULL_MESI_Two_Level,
          NULL_MOESI_CMP_directory, NULL_MOESI_CMP_token, NULL_MOESI_hammer, POWER, RISCV, SPARC, X86, X86_MI_example, X86_MOESI_AMD_Base, VEGA_X86]
-        image: [gcc-version-13, clang-version-18]
+        image: [gcc-version-14, clang-version-18]
        opts: [.opt]
     runs-on: [self-hosted, linux, x64]
     timeout-minutes: 2880 # 48 hours
diff --git a/.github/workflows/daily-tests.yaml b/.github/workflows/daily-tests.yaml
index 54711ad63d..584cce0d90 100644
--- a/.github/workflows/daily-tests.yaml
+++ b/.github/workflows/daily-tests.yaml
@@ -8,21 +8,32 @@ on:
   workflow_dispatch:
 
 jobs:
+
+  get-date:
+    runs-on: ubuntu-latest
+    outputs:
+      date: ${{ steps.date.outputs.date }}
+    steps:
+    - name: Get the current date
+      id: date
+      run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT
+
   # this builds both unittests.fast and unittests.debug
   unittests-fast-debug:
     strategy:
       matrix:
        type: [fast, debug]
     runs-on: [self-hosted, linux, x64]
     container: ghcr.io/gem5/ubuntu-24.04_all-dependencies:latest
     timeout-minutes: 60
+    needs: get-date
     steps:
     - uses: actions/checkout@v4
     - name: Cache build/ALL
       uses: actions/cache/restore@v4
       with:
         path: build/ALL
-        key: testlib-build-all-${{ hashFiles('src/**') }}
+        key: testlib-build-all-${{ needs.get-date.outputs.date }}
         restore-keys: |
           testlib-build-all
     - name: ALL/unittests.${{ matrix.type }} UnitTests
@@ -38,6 +49,7 @@ jobs:
     runs-on: [self-hosted, linux, x64]
     container: ghcr.io/gem5/ubuntu-24.04_all-dependencies:latest
     timeout-minutes: 1440 # 24 hours for entire matrix to run
+    needs: get-date
     steps:
     - name: Clean runner
       run: rm -rf ./* || true rm -rf ./.??* || true rm -rf ~/.cache || true
@@ -47,13 +59,13 @@ jobs:
       uses: actions/cache@v4
       with:
         path: build/NULL
-        key: testlib-build-null-${{ hashFiles('src/**') }}
+        key: testlib-build-null-${{ needs.get-date.outputs.date }}
 
     - name: Restore build/ALL cache
       uses: actions/cache@v4
       with:
         path: build/ALL
-        key: testlib-build-all-${{ hashFiles('src/**') }}
+        key: testlib-build-all-${{ needs.get-date.outputs.date }}
 
     - name: long ${{ matrix.test-type }} tests
       working-directory: ${{ github.workspace }}/tests
@@ -81,6 +93,7 @@ jobs:
       gem5-library-example-arm-ubuntu-run-test-ALL-x86_64-opt, gem5-library-example-riscvmatched-hello-ALL-x86_64-opt]
     container: ghcr.io/gem5/ubuntu-24.04_all-dependencies:latest
     timeout-minutes: 1440 # 24 hours
+    needs: get-date
     steps:
     - name: Clean runner
       run: rm -rf ./* || true rm -rf ./.??* || true rm -rf ~/.cache || true
@@ -90,7 +103,7 @@ jobs:
       uses: actions/cache@v4
       with:
         path: build/ALL
-        key: testlib-build-all-${{ hashFiles('src/**') }}
+        key: testlib-build-all-${{ needs.get-date.outputs.date }}
         restore-keys: |
           testlib-build-all
 
@@ -113,6 +126,7 @@ jobs:
     runs-on: [self-hosted, linux, x64]
     container: ghcr.io/gem5/gcn-gpu:latest
     timeout-minutes: 720 # 12 hours
+    needs: get-date
     steps:
     - uses: actions/checkout@v4
 
@@ -123,7 +137,7 @@ jobs:
       uses: actions/cache@v4
       with:
         path: build/VEGA_X86
-        key: testlib-build-vega-${{ hashFiles('src/**') }}
+        key: testlib-build-vega-${{ needs.get-date.outputs.date }}
         restore-keys: |
           testlib-build-vega
 
diff --git a/.github/workflows/weekly-tests.yaml b/.github/workflows/weekly-tests.yaml
index 7ada70fddb..6baec1fa68 100644
--- a/.github/workflows/weekly-tests.yaml
+++ b/.github/workflows/weekly-tests.yaml
@@ -9,6 +9,15 @@ on:
 
 jobs:
 
+  get-date:
+    runs-on: ubuntu-latest
+    outputs:
+      date: ${{ steps.date.outputs.date }}
+    steps:
+    - name: Get the current date
+      id: date
+      run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT
+
   # start running the very-long tests
   testlib-very-long-tests:
     strategy:
@@ -18,6 +27,7 @@ jobs:
     runs-on: [self-hosted, linux, x64]
     container: ghcr.io/gem5/ubuntu-24.04_all-dependencies:latest
     timeout-minutes: 4320 # 3 days
+    needs: get-date
     steps:
     - name: Clean runner
       run: rm -rf ./* || true rm -rf ./.??* || true rm -rf ~/.cache || true
@@ -27,7 +37,7 @@ jobs:
       uses: actions/cache@v4
       with:
         path: build/ALL
-        key: testlib-build-all-${{ hashFiles('src/**') }}
+        key: testlib-build-all-${{ needs.get-date.outputs.date }}
         restore-keys: |
           testlib-build-all
 
@@ -49,6 +59,7 @@ jobs:
     runs-on: [self-hosted, linux, x64]
     container: ghcr.io/gem5/gcn-gpu:latest
     timeout-minutes: 4320 # 3 days
+    needs: get-date
     steps:
     - uses: actions/checkout@v4
 
@@ -59,7 +70,7 @@ jobs:
       uses: actions/cache@v4
       with:
         path: build/VEGA_X86
-        key: testlib-build-vega-${{ hashFiles('src/**') }}
+        key: testlib-build-vega-${{ needs.get-date.outputs.date }}
         restore-keys: |
           testlib-build-vega
 
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 7e17adca7f..03e39a3639 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -49,11 +49,11 @@ exclude: |
     tests/.*/ref/.*
   )$
 
-default_stages: [commit]
+default_stages: [pre-commit]
 
 repos:
 - repo: https://github.com/pre-commit/pre-commit-hooks
-  rev: v4.5.0
+  rev: v5.0.0
   hooks:
   - id: trailing-whitespace
   - id: end-of-file-fixer
@@ -69,7 +69,7 @@ repos:
   - id: destroyed-symlinks
   - id: requirements-txt-fixer
 - repo: https://github.com/PyCQA/isort
-  rev: 5.11.5
+  rev: 5.13.2
   hooks:
   - id: isort
 - repo: https://github.com/jumanjihouse/pre-commit-hook-yamlfmt
@@ -77,11 +77,11 @@
repos: hooks: - id: yamlfmt - repo: https://github.com/psf/black - rev: 23.9.1 + rev: 24.10.0 hooks: - id: black - repo: https://github.com/asottile/pyupgrade - rev: v3.14.0 + rev: v3.17.0 hooks: - id: pyupgrade # Python 3.8 is the earliest version supported. diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000000..9543f965b7 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,7 @@ +{ + "python.analysis.extraPaths": [ + "src/python", + "ext", + "tests" + ] +} diff --git a/configs/common/HMC.py b/configs/common/HMC.py index 98ff091115..0dfbebb3e5 100644 --- a/configs/common/HMC.py +++ b/configs/common/HMC.py @@ -568,9 +568,9 @@ def config_hmc_dev(opt, system, hmc_host): # Attach 4 serial link to 4 crossbar/s for i in range(opt.num_serial_links): if opt.enable_link_monitor: - system.hmc_host.seriallink[ - i - ].mem_side_port = system.hmc_dev.lmonitor[i].cpu_side_port + system.hmc_host.seriallink[i].mem_side_port = ( + system.hmc_dev.lmonitor[i].cpu_side_port + ) system.hmc_dev.lmonitor[i].mem_side_port = system.hmc_dev.xbar[ i ].cpu_side_ports @@ -613,14 +613,12 @@ def config_hmc_dev(opt, system, hmc_host): ] # Connect the bridge between corssbars - system.hmc_dev.xbar[ - i - ].mem_side_ports = system.hmc_dev.buffers[ - index - ].cpu_side_port - system.hmc_dev.buffers[ - index - ].mem_side_port = system.hmc_dev.xbar[j].cpu_side_ports + system.hmc_dev.xbar[i].mem_side_ports = ( + system.hmc_dev.buffers[index].cpu_side_port + ) + system.hmc_dev.buffers[index].mem_side_port = ( + system.hmc_dev.xbar[j].cpu_side_ports + ) else: # Don't connect the xbar to itself pass @@ -629,49 +627,49 @@ def config_hmc_dev(opt, system, hmc_host): # can only direct traffic to it local vaults if opt.arch == "mixed": system.hmc_dev.buffer30 = Bridge(ranges=system.mem_ranges[0:4]) - system.hmc_dev.xbar[ - 3 - ].mem_side_ports = system.hmc_dev.buffer30.cpu_side_port + system.hmc_dev.xbar[3].mem_side_ports = ( + system.hmc_dev.buffer30.cpu_side_port + 
) system.hmc_dev.buffer30.mem_side_port = system.hmc_dev.xbar[ 0 ].cpu_side_ports system.hmc_dev.buffer31 = Bridge(ranges=system.mem_ranges[4:8]) - system.hmc_dev.xbar[ - 3 - ].mem_side_ports = system.hmc_dev.buffer31.cpu_side_port + system.hmc_dev.xbar[3].mem_side_ports = ( + system.hmc_dev.buffer31.cpu_side_port + ) system.hmc_dev.buffer31.mem_side_port = system.hmc_dev.xbar[ 1 ].cpu_side_ports system.hmc_dev.buffer32 = Bridge(ranges=system.mem_ranges[8:12]) - system.hmc_dev.xbar[ - 3 - ].mem_side_ports = system.hmc_dev.buffer32.cpu_side_port + system.hmc_dev.xbar[3].mem_side_ports = ( + system.hmc_dev.buffer32.cpu_side_port + ) system.hmc_dev.buffer32.mem_side_port = system.hmc_dev.xbar[ 2 ].cpu_side_ports system.hmc_dev.buffer20 = Bridge(ranges=system.mem_ranges[0:4]) - system.hmc_dev.xbar[ - 2 - ].mem_side_ports = system.hmc_dev.buffer20.cpu_side_port + system.hmc_dev.xbar[2].mem_side_ports = ( + system.hmc_dev.buffer20.cpu_side_port + ) system.hmc_dev.buffer20.mem_side_port = system.hmc_dev.xbar[ 0 ].cpu_side_ports system.hmc_dev.buffer21 = Bridge(ranges=system.mem_ranges[4:8]) - system.hmc_dev.xbar[ - 2 - ].mem_side_ports = system.hmc_dev.buffer21.cpu_side_port + system.hmc_dev.xbar[2].mem_side_ports = ( + system.hmc_dev.buffer21.cpu_side_port + ) system.hmc_dev.buffer21.mem_side_port = system.hmc_dev.xbar[ 1 ].cpu_side_ports system.hmc_dev.buffer23 = Bridge(ranges=system.mem_ranges[12:16]) - system.hmc_dev.xbar[ - 2 - ].mem_side_ports = system.hmc_dev.buffer23.cpu_side_port + system.hmc_dev.xbar[2].mem_side_ports = ( + system.hmc_dev.buffer23.cpu_side_port + ) system.hmc_dev.buffer23.mem_side_port = system.hmc_dev.xbar[ 3 ].cpu_side_ports diff --git a/configs/common/Simulation.py b/configs/common/Simulation.py index 3e332d76b4..be928651ae 100644 --- a/configs/common/Simulation.py +++ b/configs/common/Simulation.py @@ -541,9 +541,9 @@ def run(options, root, testsys, cpu_class): IndirectBPClass = ObjectList.indirect_bp_list.get( options.indirect_bp_type ) - 
switch_cpus[ - i - ].branchPred.indirectBranchPred = IndirectBPClass() + switch_cpus[i].branchPred.indirectBranchPred = ( + IndirectBPClass() + ) switch_cpus[i].createThreads() # If elastic tracing is enabled attach the elastic trace probe diff --git a/configs/common/cores/arm/HPI.py b/configs/common/cores/arm/HPI.py index 826d4e19f4..36aa64eca5 100644 --- a/configs/common/cores/arm/HPI.py +++ b/configs/common/cores/arm/HPI.py @@ -1683,6 +1683,15 @@ class HPI_MMU(ArmMMU): class HPI_BTB(SimpleBTB): numEntries = 128 tagBits = 18 + associativity = 1 + instShiftAmt = 2 + btbReplPolicy = LRURP() + btbIndexingPolicy = BTBSetAssociative( + num_entries=Parent.numEntries, + set_shift=Parent.instShiftAmt, + assoc=Parent.associativity, + tag_bits=Parent.tagBits, + ) class HPI_BP(TournamentBP): diff --git a/configs/common/cores/arm/O3_ARM_v7a.py b/configs/common/cores/arm/O3_ARM_v7a.py index 45bb391bb1..ee42c3c062 100644 --- a/configs/common/cores/arm/O3_ARM_v7a.py +++ b/configs/common/cores/arm/O3_ARM_v7a.py @@ -111,6 +111,15 @@ class O3_ARM_v7a_FUP(FUPool): class O3_ARM_v7a_BTB(SimpleBTB): numEntries = 2048 tagBits = 18 + associativity = 1 + instShiftAmt = 2 + btbReplPolicy = LRURP() + btbIndexingPolicy = BTBSetAssociative( + num_entries=Parent.numEntries, + set_shift=Parent.instShiftAmt, + assoc=Parent.associativity, + tag_bits=Parent.tagBits, + ) # Bi-Mode Branch Predictor diff --git a/configs/common/cores/arm/ex5_big.py b/configs/common/cores/arm/ex5_big.py index f3b55fd3a8..8ea04aa5f7 100644 --- a/configs/common/cores/arm/ex5_big.py +++ b/configs/common/cores/arm/ex5_big.py @@ -108,6 +108,15 @@ class ex5_big_FUP(FUPool): class ex5_big_BTB(SimpleBTB): numEntries = 4096 tagBits = 18 + associativity = 1 + instShiftAmt = 2 + btbReplPolicy = LRURP() + btbIndexingPolicy = BTBSetAssociative( + num_entries=Parent.numEntries, + set_shift=Parent.instShiftAmt, + assoc=Parent.associativity, + tag_bits=Parent.tagBits, + ) # Bi-Mode Branch Predictor diff --git 
a/configs/deprecated/example/fs.py b/configs/deprecated/example/fs.py index 7426c47c7e..df77b6d830 100644 --- a/configs/deprecated/example/fs.py +++ b/configs/deprecated/example/fs.py @@ -213,9 +213,9 @@ def build_test_system(np, isa: ISA): IndirectBPClass = ObjectList.indirect_bp_list.get( args.indirect_bp_type ) - test_sys.cpu[ - i - ].branchPred.indirectBranchPred = IndirectBPClass() + test_sys.cpu[i].branchPred.indirectBranchPred = ( + IndirectBPClass() + ) test_sys.cpu[i].createThreads() # If elastic tracing is enabled when not restoring from checkpoint and diff --git a/configs/example/apu_se.py b/configs/example/apu_se.py index 1ae6edf391..d512594afe 100644 --- a/configs/example/apu_se.py +++ b/configs/example/apu_se.py @@ -935,9 +935,9 @@ gpu_port_idx = gpu_port_idx - args.num_cp * 2 token_port_idx = 0 for i in range(len(system.ruby._cpu_ports)): if isinstance(system.ruby._cpu_ports[i], VIPERCoalescer): - system.cpu[shader_idx].CUs[ - token_port_idx - ].gmTokenPort = system.ruby._cpu_ports[i].gmTokenPort + system.cpu[shader_idx].CUs[token_port_idx].gmTokenPort = ( + system.ruby._cpu_ports[i].gmTokenPort + ) token_port_idx += 1 wavefront_size = args.wf_size diff --git a/configs/example/gem5_library/arm-demo-ubuntu-run.py b/configs/example/gem5_library/arm-demo-ubuntu-run.py new file mode 100644 index 0000000000..9b39c34330 --- /dev/null +++ b/configs/example/gem5_library/arm-demo-ubuntu-run.py @@ -0,0 +1,92 @@ +# Copyright (c) 2024 The Regents of the University of California +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +""" +This script further shows an example of booting an ARM based full system Ubuntu +disk image. This simulation boots the disk image using the ArmDemoBoard. 
+ +Usage +----- + +```bash +scons build/ARM/gem5.opt -j $(nproc) +./build/ARM/gem5.opt configs/example/gem5_library/arm-demo-ubuntu-run.py +``` +""" +import argparse + +from gem5.isas import ISA +from gem5.prebuilt.demo.arm_demo_board import ArmDemoBoard +from gem5.resources.resource import obtain_resource +from gem5.simulate.exit_event import ExitEvent +from gem5.simulate.simulator import Simulator +from gem5.utils.requires import requires + +# This runs a check to ensure the gem5 binary interpreting this file is compiled to include the ARM ISA. +requires(isa_required=ISA.ARM) + +parser = argparse.ArgumentParser( + description="An example configuration script to run the ArmDemoBoard." +) + +parser.add_argument( + "--use-kvm", + action="store_true", + help="Use KVM cores instead of Timing.", +) +args = parser.parse_args() + +board = ArmDemoBoard(use_kvm=args.use_kvm) + +board.set_workload( + obtain_resource( + "arm-ubuntu-24.04-boot-with-systemd", resource_version="2.0.0" + ) +) + + +def exit_event_handler(): + print("First exit: kernel booted") + yield False # gem5 is now executing systemd startup + print("Second exit: Started `after_boot.sh` script") + # The after_boot.sh script is executed after the kernel and systemd have + # booted. + yield False # gem5 is now executing the `after_boot.sh` script + print("Third exit: Finished `after_boot.sh` script") + # The after_boot.sh script will run a script if it is passed via + # m5 readfile. This is the last exit event before the simulation exits. + yield True + + +# We define the system with the aforementioned system defined. 
+simulator = Simulator( + board=board, + on_exit_event={ + ExitEvent.EXIT: exit_event_handler(), + }, +) + +simulator.run() diff --git a/configs/example/gem5_library/riscv-rvv-example.py b/configs/example/gem5_library/riscv-rvv-example.py new file mode 100755 index 0000000000..57a6fd7afd --- /dev/null +++ b/configs/example/gem5_library/riscv-rvv-example.py @@ -0,0 +1,120 @@ +# Copyright (c) 2024 Barcelona Supercomputing Center +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software without +# specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +""" +This script demonstrates how to run RISC-V vector-enabled binaries in SE mode +with gem5. It accepts the number of CORES, VLEN, and ELEN as optional +parameters, as well as the resource name to run. If no resource name is +provided, a list of available resources will be displayed. If one is given the +simulation will then execute the specified resource binary with the selected +parameters until completion. + + +Usage +----- + +# Compile gem5 for RISC-V +scons build/RISCV/gem5.opt + +# Run the simulation +./build/RISCV/gem5.opt configs/example/gem5_library/riscv-rvv-example.py \ + [-c CORES] [-v VLEN] [-e ELEN] + +""" + +import argparse + +from m5.objects import RiscvO3CPU + +from gem5.components.boards.simple_board import SimpleBoard +from gem5.components.cachehierarchies.classic.private_l1_private_l2_cache_hierarchy import ( + PrivateL1PrivateL2CacheHierarchy, +) +from gem5.components.memory import SingleChannelDDR3_1600 +from gem5.components.processors.base_cpu_core import BaseCPUCore +from gem5.components.processors.base_cpu_processor import BaseCPUProcessor +from gem5.isas import ISA +from gem5.resources.resource import obtain_resource +from gem5.simulate.simulator import Simulator +from gem5.utils.requires import requires + + +class RVVCore(BaseCPUCore): + def __init__(self, elen, vlen, cpu_id): + super().__init__(core=RiscvO3CPU(cpu_id=cpu_id), isa=ISA.RISCV) + self.core.isa[0].elen = elen + self.core.isa[0].vlen = vlen + + +requires(isa_required=ISA.RISCV) + +resources = [ + "rvv-branch", + "rvv-index", + "rvv-matmul", + "rvv-memcpy", + "rvv-reduce", + "rvv-saxpy", + "rvv-sgemm", + "rvv-strcmp", + "rvv-strcpy", + "rvv-strlen", + "rvv-strlen-fault", + "rvv-strncpy", +] + +parser = argparse.ArgumentParser() +parser.add_argument("resource", type=str, choices=resources) +parser.add_argument("-c", "--cores", required=False, type=int, default=1) +parser.add_argument("-v", "--vlen", required=False, type=int, default=256) +parser.add_argument("-e", "--elen", 
required=False, type=int, default=64) + +args = parser.parse_args() + +cache_hierarchy = PrivateL1PrivateL2CacheHierarchy( + l1d_size="32KiB", l1i_size="32KiB", l2_size="512KiB" +) + +memory = SingleChannelDDR3_1600() + +processor = BaseCPUProcessor( + cores=[RVVCore(args.elen, args.vlen, i) for i in range(args.cores)] +) + +board = SimpleBoard( + clk_freq="1GHz", + processor=processor, + memory=memory, + cache_hierarchy=cache_hierarchy, +) + +binary = obtain_resource(args.resource) +board.set_se_binary_workload(binary) + +simulator = Simulator(board=board, full_system=False) +print("Beginning simulation!") +simulator.run() diff --git a/configs/example/gpufs/runfs.py b/configs/example/gpufs/runfs.py index eb95526509..8cb29b07ba 100644 --- a/configs/example/gpufs/runfs.py +++ b/configs/example/gpufs/runfs.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 Advanced Micro Devices, Inc. +# Copyright (c) 2021-2024 Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -82,10 +82,6 @@ def addRunFSOptions(parser): help="The second disk image to mount (/dev/sdb)", ) parser.add_argument("--kernel", default=None, help="Linux kernel to boot") - parser.add_argument("--gpu-rom", default=None, help="GPU BIOS to load") - parser.add_argument( - "--gpu-mmio-trace", default=None, help="GPU MMIO trace to load" - ) parser.add_argument( "--checkpoint-before-mmios", default=False, @@ -241,16 +237,6 @@ def runGpuFSSystem(args): math.ceil(float(n_cu) / args.cu_per_scalar_cache) ) - # Verify MMIO trace is valid. This is only needed for Vega10 simulations. - # The md5sum refers to the md5sum of the Vega10 MMIO hardware trace in - # the gem5-resources repository. By checking it here, we avoid potential - # errors that would cause the driver not to load and simulations to fail. 
- if args.gpu_device == "Vega10": - mmio_file = open(args.gpu_mmio_trace, "rb") - mmio_md5 = hashlib.md5(mmio_file.read()).hexdigest() - if mmio_md5 != "c4ff3326ae8a036e329b8b595c83bd6d": - m5.util.panic("MMIO file does not match gem5 resources") - system = makeGpuFSSystem(args) root = Root( diff --git a/configs/example/gpufs/system/amdgpu.py b/configs/example/gpufs/system/amdgpu.py index dedbcc9324..bdeda9024a 100644 --- a/configs/example/gpufs/system/amdgpu.py +++ b/configs/example/gpufs/system/amdgpu.py @@ -176,8 +176,6 @@ def createGPU(system, args): def connectGPU(system, args): system.pc.south_bridge.gpu = AMDGPUDevice(pci_func=0, pci_dev=8, pci_bus=0) - system.pc.south_bridge.gpu.trace_file = args.gpu_mmio_trace - system.pc.south_bridge.gpu.rom_binary = args.gpu_rom system.pc.south_bridge.gpu.checkpoint_before_mmios = ( args.checkpoint_before_mmios ) diff --git a/configs/example/gpufs/system/system.py b/configs/example/gpufs/system/system.py index 1ce261d764..b650659303 100644 --- a/configs/example/gpufs/system/system.py +++ b/configs/example/gpufs/system/system.py @@ -336,9 +336,9 @@ def makeGpuFSSystem(args): token_port_idx = 0 for i in range(len(system.ruby._cpu_ports)): if isinstance(system.ruby._cpu_ports[i], VIPERCoalescer): - system.cpu[shader_idx].CUs[ - token_port_idx - ].gmTokenPort = system.ruby._cpu_ports[i].gmTokenPort + system.cpu[shader_idx].CUs[token_port_idx].gmTokenPort = ( + system.ruby._cpu_ports[i].gmTokenPort + ) token_port_idx += 1 wavefront_size = args.wf_size @@ -346,9 +346,9 @@ def makeGpuFSSystem(args): # The pipeline issues wavefront_size number of uncoalesced requests # in one GPU issue cycle. Hence wavefront_size mem ports. 
for j in range(wavefront_size): - system.cpu[shader_idx].CUs[i].memory_port[ - j - ] = system.ruby._cpu_ports[gpu_port_idx].in_ports[j] + system.cpu[shader_idx].CUs[i].memory_port[j] = ( + system.ruby._cpu_ports[gpu_port_idx].in_ports[j] + ) gpu_port_idx += 1 for i in range(args.num_compute_units): diff --git a/configs/example/lupv/run_lupv.py b/configs/example/lupv/run_lupv.py index f6f938b16c..57bf6ca6b8 100644 --- a/configs/example/lupv/run_lupv.py +++ b/configs/example/lupv/run_lupv.py @@ -110,8 +110,7 @@ board.set_kernel_disk_workload( # Begin running of the simulation. print("Running with ISA: " + processor.get_isa().name) print() -root = Root(full_system=True, system=board) -board._pre_instantiate() +root = board._pre_instantiate() m5.instantiate() print("Beginning simulation!") diff --git a/configs/example/read_config.py b/configs/example/read_config.py index 27e23b69ee..9f86c3af49 100644 --- a/configs/example/read_config.py +++ b/configs/example/read_config.py @@ -250,9 +250,11 @@ class ConfigManager: obj, param_name, [ - self.objects_by_name[name] - if name != "Null" - else m5.params.NULL + ( + self.objects_by_name[name] + if name != "Null" + else m5.params.NULL + ) for name in param_values ], ) diff --git a/configs/example/ruby_gpu_random_test.py b/configs/example/ruby_gpu_random_test.py index bfcd2c953d..eb7dd3acbd 100644 --- a/configs/example/ruby_gpu_random_test.py +++ b/configs/example/ruby_gpu_random_test.py @@ -371,6 +371,7 @@ for dma_idx in range(n_DMAs): num_lanes=1, clk_domain=thread_clock, deadlock_threshold=tester_deadlock_threshold, + cache_line_size=system.cache_line_size, ) ) g_thread_idx += 1 @@ -393,6 +394,7 @@ for cu_idx in range(n_CUs): num_lanes=args.wf_size, clk_domain=thread_clock, deadlock_threshold=tester_deadlock_threshold, + cache_line_size=system.cache_line_size, ) ) g_thread_idx += 1 diff --git a/configs/learning_gem5/part3/msi_caches.py b/configs/learning_gem5/part3/msi_caches.py index c198662c5e..b719c7ab60 100644 --- 
a/configs/learning_gem5/part3/msi_caches.py +++ b/configs/learning_gem5/part3/msi_caches.py @@ -84,6 +84,7 @@ class MyCacheSystem(RubySystem): # I/D cache is combined and grab from ctrl dcache=self.controllers[i].cacheMemory, clk_domain=self.controllers[i].clk_domain, + ruby_system=self, ) for i in range(len(cpus)) ] @@ -191,7 +192,9 @@ class DirController(Directory_Controller): self.version = self.versionCount() self.addr_ranges = ranges self.ruby_system = ruby_system - self.directory = RubyDirectoryMemory() + self.directory = RubyDirectoryMemory( + block_size=ruby_system.block_size_bytes + ) # Connect this directory to the memory side. self.memory = mem_ctrls[0].port self.connectQueues(ruby_system) diff --git a/configs/learning_gem5/part3/ruby_caches_MI_example.py b/configs/learning_gem5/part3/ruby_caches_MI_example.py index baee120bb9..583041a674 100644 --- a/configs/learning_gem5/part3/ruby_caches_MI_example.py +++ b/configs/learning_gem5/part3/ruby_caches_MI_example.py @@ -84,6 +84,7 @@ class MyCacheSystem(RubySystem): # I/D cache is combined and grab from ctrl dcache=self.controllers[i].cacheMemory, clk_domain=self.controllers[i].clk_domain, + ruby_system=self, ) for i in range(len(cpus)) ] @@ -180,7 +181,9 @@ class DirController(Directory_Controller): self.version = self.versionCount() self.addr_ranges = ranges self.ruby_system = ruby_system - self.directory = RubyDirectoryMemory() + self.directory = RubyDirectoryMemory( + block_size=ruby_system.block_size_bytes + ) # Connect this directory to the memory side. 
self.memory = mem_ctrls[0].port self.connectQueues(ruby_system) diff --git a/configs/learning_gem5/part3/test_caches.py b/configs/learning_gem5/part3/test_caches.py index 4e8e8febda..be2d46253e 100644 --- a/configs/learning_gem5/part3/test_caches.py +++ b/configs/learning_gem5/part3/test_caches.py @@ -79,6 +79,7 @@ class TestCacheSystem(RubySystem): # I/D cache is combined and grab from ctrl dcache=self.controllers[i].cacheMemory, clk_domain=self.clk_domain, + ruby_system=self, ) for i in range(num_testers) ] diff --git a/configs/ruby/AMD_Base_Constructor.py b/configs/ruby/AMD_Base_Constructor.py index ff4246a7e0..7d40862517 100644 --- a/configs/ruby/AMD_Base_Constructor.py +++ b/configs/ruby/AMD_Base_Constructor.py @@ -84,14 +84,14 @@ class CPCntrl(AMD_Base_Controller, CntrlBase): self.L2cache = L2Cache() self.L2cache.create(options.l2_size, options.l2_assoc, options) - self.sequencer = RubySequencer() + self.sequencer = RubySequencer(ruby_system=ruby_system) self.sequencer.version = self.seqCount() self.sequencer.dcache = self.L1D0cache self.sequencer.ruby_system = ruby_system self.sequencer.coreid = 0 self.sequencer.is_cpu_sequencer = True - self.sequencer1 = RubySequencer() + self.sequencer1 = RubySequencer(ruby_system=ruby_system) self.sequencer1.version = self.seqCount() self.sequencer1.dcache = self.L1D1cache self.sequencer1.ruby_system = ruby_system diff --git a/configs/ruby/GPU_VIPER.py b/configs/ruby/GPU_VIPER.py index 313d1d514a..15108bb674 100644 --- a/configs/ruby/GPU_VIPER.py +++ b/configs/ruby/GPU_VIPER.py @@ -114,14 +114,14 @@ class CPCntrl(CorePair_Controller, CntrlBase): self.L2cache = L2Cache() self.L2cache.create(options.l2_size, options.l2_assoc, options) - self.sequencer = RubySequencer() + self.sequencer = RubySequencer(ruby_system=ruby_system) self.sequencer.version = self.seqCount() self.sequencer.dcache = self.L1D0cache self.sequencer.ruby_system = ruby_system self.sequencer.coreid = 0 self.sequencer.is_cpu_sequencer = True - 
self.sequencer1 = RubySequencer() + self.sequencer1 = RubySequencer(ruby_system=ruby_system) self.sequencer1.version = self.seqCount() self.sequencer1.dcache = self.L1D1cache self.sequencer1.ruby_system = ruby_system @@ -169,7 +169,7 @@ class TCPCntrl(TCP_Controller, CntrlBase): # TCP_Controller inherits this from RubyController self.mandatory_queue_latency = options.mandatory_queue_latency - self.coalescer = VIPERCoalescer() + self.coalescer = VIPERCoalescer(ruby_system=ruby_system) self.coalescer.version = self.seqCount() self.coalescer.icache = self.L1cache self.coalescer.dcache = self.L1cache @@ -182,7 +182,7 @@ class TCPCntrl(TCP_Controller, CntrlBase): options.max_coalesces_per_cycle ) - self.sequencer = RubySequencer() + self.sequencer = RubySequencer(ruby_system=ruby_system) self.sequencer.version = self.seqCount() self.sequencer.dcache = self.L1cache self.sequencer.ruby_system = ruby_system @@ -211,7 +211,7 @@ class TCPCntrl(TCP_Controller, CntrlBase): self.L1cache.create(options) self.issue_latency = 1 - self.coalescer = VIPERCoalescer() + self.coalescer = VIPERCoalescer(ruby_system=ruby_system) self.coalescer.version = self.seqCount() self.coalescer.icache = self.L1cache self.coalescer.dcache = self.L1cache @@ -219,7 +219,7 @@ class TCPCntrl(TCP_Controller, CntrlBase): self.coalescer.support_inst_reqs = False self.coalescer.is_cpu_sequencer = False - self.sequencer = RubySequencer() + self.sequencer = RubySequencer(ruby_system=ruby_system) self.sequencer.version = self.seqCount() self.sequencer.dcache = self.L1cache self.sequencer.ruby_system = ruby_system @@ -387,7 +387,9 @@ class DirCntrl(Directory_Controller, CntrlBase): self.response_latency = 30 self.addr_ranges = dir_ranges - self.directory = RubyDirectoryMemory() + self.directory = RubyDirectoryMemory( + block_size=ruby_system.block_size_bytes + ) self.L3CacheMemory = L3Cache() self.L3CacheMemory.create(options, ruby_system, system) @@ -686,7 +688,7 @@ def construct_gpudirs(options, system, 
ruby_system, network): dir_cntrl.addr_ranges = dram_intf.range # Append - exec("system.ruby.gpu_dir_cntrl%d = dir_cntrl" % i) + exec("ruby_system.gpu_dir_cntrl%d = dir_cntrl" % i) dir_cntrl_nodes.append(dir_cntrl) mem_ctrls.append(mem_ctrl) diff --git a/configs/ruby/MESI_Three_Level.py b/configs/ruby/MESI_Three_Level.py index e0de4e0636..9054fefc01 100644 --- a/configs/ruby/MESI_Three_Level.py +++ b/configs/ruby/MESI_Three_Level.py @@ -148,6 +148,7 @@ def create_system( train_misses=5, num_startup_pfs=4, cross_page=True, + block_size=options.cacheline_size, ) l0_cntrl = L0Cache_Controller( diff --git a/configs/ruby/MESI_Three_Level_HTM.py b/configs/ruby/MESI_Three_Level_HTM.py index e6c4e81f91..d7ad3bdc04 100644 --- a/configs/ruby/MESI_Three_Level_HTM.py +++ b/configs/ruby/MESI_Three_Level_HTM.py @@ -148,6 +148,7 @@ def create_system( train_misses=5, num_startup_pfs=4, cross_page=True, + block_size=options.cacheline_size, ) l0_cntrl = L0Cache_Controller( diff --git a/configs/ruby/MESI_Two_Level.py b/configs/ruby/MESI_Two_Level.py index 500afbc199..6e1e0b97f3 100644 --- a/configs/ruby/MESI_Two_Level.py +++ b/configs/ruby/MESI_Two_Level.py @@ -94,7 +94,7 @@ def create_system( is_icache=False, ) - prefetcher = RubyPrefetcher() + prefetcher = RubyPrefetcher(block_size=options.cacheline_size) clk_domain = cpus[i].clk_domain diff --git a/configs/ruby/MOESI_AMD_Base.py b/configs/ruby/MOESI_AMD_Base.py index aeab96a85f..1095defc57 100644 --- a/configs/ruby/MOESI_AMD_Base.py +++ b/configs/ruby/MOESI_AMD_Base.py @@ -112,14 +112,14 @@ class CPCntrl(CorePair_Controller, CntrlBase): self.L2cache = L2Cache() self.L2cache.create(options) - self.sequencer = RubySequencer() + self.sequencer = RubySequencer(ruby_system=ruby_system) self.sequencer.version = self.seqCount() self.sequencer.dcache = self.L1D0cache self.sequencer.ruby_system = ruby_system self.sequencer.coreid = 0 self.sequencer.is_cpu_sequencer = True - self.sequencer1 = RubySequencer() + self.sequencer1 = 
RubySequencer(ruby_system=ruby_system) self.sequencer1.version = self.seqCount() self.sequencer1.dcache = self.L1D1cache self.sequencer1.ruby_system = ruby_system @@ -194,7 +194,9 @@ class DirCntrl(Directory_Controller, CntrlBase): self.response_latency = 30 self.addr_ranges = dir_ranges - self.directory = RubyDirectoryMemory() + self.directory = RubyDirectoryMemory( + block_size=ruby_system.block_size_bytes + ) self.L3CacheMemory = L3Cache() self.L3CacheMemory.create(options, ruby_system, system) diff --git a/configs/ruby/Ruby.py b/configs/ruby/Ruby.py index e427a39de8..0a6671aa4b 100644 --- a/configs/ruby/Ruby.py +++ b/configs/ruby/Ruby.py @@ -308,7 +308,9 @@ def create_directories(options, bootmem, ruby_system, system): for i in range(options.num_dirs): dir_cntrl = Directory_Controller() dir_cntrl.version = i - dir_cntrl.directory = RubyDirectoryMemory() + dir_cntrl.directory = RubyDirectoryMemory( + block_size=ruby_system.block_size_bytes + ) dir_cntrl.ruby_system = ruby_system exec("ruby_system.dir_cntrl%d = dir_cntrl" % i) @@ -316,7 +318,9 @@ def create_directories(options, bootmem, ruby_system, system): if bootmem is not None: rom_dir_cntrl = Directory_Controller() - rom_dir_cntrl.directory = RubyDirectoryMemory() + rom_dir_cntrl.directory = RubyDirectoryMemory( + block_size=ruby_system.block_size_bytes + ) rom_dir_cntrl.ruby_system = ruby_system rom_dir_cntrl.version = i + 1 rom_dir_cntrl.memory = bootmem.port diff --git a/src/arch/amdgpu/vega/operand.hh b/src/arch/amdgpu/vega/operand.hh index 1bb9b43d1f..8e76405562 100644 --- a/src/arch/amdgpu/vega/operand.hh +++ b/src/arch/amdgpu/vega/operand.hh @@ -960,11 +960,14 @@ class PackedReg uint64_t elem_mask = (1ULL << ELEM_SIZE) - 1; value &= elem_mask; + // Clear the bits where the value goes so that operator| can be used. 
elem_mask <<= qw_lbit; - qword &= elem_mask; + qword &= ~elem_mask; - value <<= qw_lbit; - qword |= value; + // Promote to 64-bit to prevent shifting out of range + uint64_t value64 = value; + value64 <<= qw_lbit; + qword |= value64; dwords[udw] = uint32_t(qword >> 32); dwords[ldw] = uint32_t(qword & mask(32)); diff --git a/src/arch/arm/decoder.cc b/src/arch/arm/decoder.cc index 9fc4be0e9a..3e898c5a47 100644 --- a/src/arch/arm/decoder.cc +++ b/src/arch/arm/decoder.cc @@ -53,8 +53,6 @@ namespace gem5 namespace ArmISA { -GenericISA::BasicDecodeCache Decoder::defaultCache; - Decoder::Decoder(const ArmDecoderParams ¶ms) : InstDecoder(params, &data), dvmEnabled(params.dvm_enabled), diff --git a/src/arch/arm/decoder.hh b/src/arch/arm/decoder.hh index 75488b6750..57c29546ae 100644 --- a/src/arch/arm/decoder.hh +++ b/src/arch/arm/decoder.hh @@ -94,7 +94,7 @@ class Decoder : public InstDecoder enums::DecoderFlavor decoderFlavor; /// A cache of decoded instruction objects. - static GenericISA::BasicDecodeCache defaultCache; + GenericISA::BasicDecodeCache defaultCache; friend class GenericISA::BasicDecodeCache; /** diff --git a/src/arch/arm/faults.hh b/src/arch/arm/faults.hh index a76439574a..bcd067c284 100644 --- a/src/arch/arm/faults.hh +++ b/src/arch/arm/faults.hh @@ -264,7 +264,7 @@ class ArmFaultVals : public ArmFault static FaultVals vals; public: - ArmFaultVals(ExtMachInst mach_inst = 0, uint32_t _iss = 0) : + ArmFaultVals(ExtMachInst mach_inst = 0, uint32_t _iss = 0) : ArmFault(mach_inst, _iss) {} FaultName name() const override { return vals.name; } FaultOffset offset(ThreadContext *tc) override; diff --git a/src/arch/arm/isa/formats/fp.isa b/src/arch/arm/isa/formats/fp.isa index c8508e16e1..45b0985838 100644 --- a/src/arch/arm/isa/formats/fp.isa +++ b/src/arch/arm/isa/formats/fp.isa @@ -1,6 +1,6 @@ // -*- mode:c++ -*- -// Copyright (c) 2010-2011, 2016-2019 ARM Limited +// Copyright (c) 2010-2011, 2016-2019, 2024 ARM Limited // All rights reserved // // The license 
below extends only to copyright in the software and shall @@ -1891,6 +1891,150 @@ let {{ return new NVrsqrteD(machInst, vd, vm); } } + } else if ((b & 0x1c) == 0x00) { + if (bits(b, 1)) { + switch(size) { + case 1: + if (q) { + return new NVcvt2uhAQ(machInst, vd, vm); + } else { + return new NVcvt2uhAD(machInst, vd, vm); + } + case 2: + if (q) { + return new NVcvt2usAQ(machInst, vd, vm); + } else { + return new NVcvt2usAD(machInst, vd, vm); + } + default: + return new Unknown(machInst); + } + } else { + switch (size) { + case 0b01: + if (q) { + return new NVcvt2shAQ(machInst, vd, vm); + } else { + return new NVcvt2shAD(machInst, vd, vm); + } + case 0b10: + if (q) { + return new NVcvt2ssAQ(machInst, vd, vm); + } else { + return new NVcvt2ssAD(machInst, vd, vm); + } + default: + return new Unknown(machInst); + } + } + } else if ((b & 0x1c) == 0x04) { + if (bits(b, 1)) { + switch (size) { + case 0b01: + if (q) { + return new NVcvt2uhNQ(machInst, vd, vm); + } else { + return new NVcvt2uhND(machInst, vd, vm); + } + case 0b10: + if (q) { + return new NVcvt2usNQ(machInst, vd, vm); + } else { + return new NVcvt2usND(machInst, vd, vm); + } + default: + return new Unknown(machInst); + } + } else { + switch (size) { + case 0b01: + if (q) { + return new NVcvt2shNQ(machInst, vd, vm); + } else { + return new NVcvt2shND(machInst, vd, vm); + } + case 0b10: + if (q) { + return new NVcvt2ssNQ(machInst, vd, vm); + } else { + return new NVcvt2ssND(machInst, vd, vm); + } + default: + return new Unknown(machInst); + } + } + } else if ((b & 0x1c) == 0x08) { + if (bits(b, 1)) { + switch (size) { + case 0b01: + if (q) { + return new NVcvt2uhPQ(machInst, vd, vm); + } else { + return new NVcvt2uhPD(machInst, vd, vm); + } + case 0b10: + if (q) { + return new NVcvt2usPQ(machInst, vd, vm); + } else { + return new NVcvt2usPD(machInst, vd, vm); + } + default: + return new Unknown(machInst); + } + } else { + switch (size) { + case 0b01: + if (q) { + return new NVcvt2shPQ(machInst, vd, vm); + } 
else { + return new NVcvt2shPD(machInst, vd, vm); + } + case 0b10: + if (q) { + return new NVcvt2ssPQ(machInst, vd, vm); + } else { + return new NVcvt2ssPD(machInst, vd, vm); + } + default: + return new Unknown(machInst); + } + } + } else if ((b & 0x1c) == 0x0c) { + if (bits(b, 1)) { + switch (size) { + case 0b01: + if (q) { + return new NVcvt2uhMQ(machInst, vd, vm); + } else { + return new NVcvt2uhMD(machInst, vd, vm); + } + case 0b10: + if (q) { + return new NVcvt2usMQ(machInst, vd, vm); + } else { + return new NVcvt2usMD(machInst, vd, vm); + } + default: + return new Unknown(machInst); + } + } else { + switch (size) { + case 0b01: + if (q) { + return new NVcvt2shMQ(machInst, vd, vm); + } else { + return new NVcvt2shMD(machInst, vd, vm); + } + case 0b10: + if (q) { + return new NVcvt2ssMQ(machInst, vd, vm); + } else { + return new NVcvt2ssMD(machInst, vd, vm); + } + default: + return new Unknown(machInst); + } + } } else { return new Unknown(machInst); } diff --git a/src/arch/arm/isa/insts/neon.isa b/src/arch/arm/isa/insts/neon.isa index 5f39e48cce..04d6929ae0 100644 --- a/src/arch/arm/isa/insts/neon.isa +++ b/src/arch/arm/isa/insts/neon.isa @@ -1,6 +1,6 @@ // -*- mode:c++ -*- -// Copyright (c) 2010-2011, 2015, 2019 ARM Limited +// Copyright (c) 2010-2011, 2015, 2019, 2024 ARM Limited // All rights reserved // // The license below extends only to copyright in the software and shall @@ -3579,6 +3579,128 @@ let {{ ''' twoRegLongMiscInst("vcvt", "NVcvth2s", "SimdCvtOp", ("uint16_t",), vcvth2sCode) + vcvthp2hCode = ''' + FPSCR fpscr = (FPSCR) FpscrExc; + VfpSavedState state = prepFpState(fpscr.rMode); + __asm__ __volatile__("" : "=m" (srcElem1) : "m" (srcElem1)); + float mid = vcvtFpHFpS(fpscr, fpscr.dn, fpscr.ahp, srcElem1); + if (flushToZero(mid)) + fpscr.idc = 1; + destElem = vfpFpToFixed(mid, %s, 16, 0, true, %s); + __asm__ __volatile__("" :: "m" (destElem)); + finishVfp(fpscr, state, true); + FpscrExc = fpscr; + ''' + + vcvtahp2uhCode = vcvthp2hCode % ("false", 
"VfpRoundAway") + twoRegMiscInst("vcvta.u16.f16", "NVcvt2uhAD", "SimdCvtOp", + ("uint16_t",), 2, vcvtahp2uhCode) + twoRegMiscInst("vcvta.u16.f16", "NVcvt2uhAQ", "SimdCvtOp", + ("uint16_t",), 4, vcvtahp2uhCode) + + vcvtnhp2uhCode = vcvthp2hCode % ("false", "VfpRoundNearest") + twoRegMiscInst("vcvtn.u16.f16", "NVcvt2uhND", "SimdCvtOp", + ("uint16_t",), 2, vcvtnhp2uhCode) + twoRegMiscInst("vcvtn.u16.f16", "NVcvt2uhNQ", "SimdCvtOp", + ("uint16_t",), 4, vcvtnhp2uhCode) + + vcvtphp2uhCode = vcvthp2hCode % ("false", "VfpRoundUpward") + twoRegMiscInst("vcvtp.u16.f16", "NVcvt2uhPD", "SimdCvtOp", + ("uint16_t",), 2, vcvtphp2uhCode) + twoRegMiscInst("vcvtp.u16.f16", "NVcvt2uhPQ", "SimdCvtOp", + ("uint16_t",), 4, vcvtphp2uhCode) + + vcvtmhp2uhCode = vcvthp2hCode % ("false", "VfpRoundDown") + twoRegMiscInst("vcvtm.u16.f16", "NVcvt2uhMD", "SimdCvtOp", + ("uint16_t",), 2, vcvtmhp2uhCode) + twoRegMiscInst("vcvtm.u16.f16", "NVcvt2uhMQ", "SimdCvtOp", + ("uint16_t",), 4, vcvtmhp2uhCode) + + vcvtahp2shCode = vcvthp2hCode % ("true", "VfpRoundAway") + twoRegMiscInst("vcvta.s16.f16", "NVcvt2shAD", "SimdCvtOp", + ("int16_t",), 2, vcvtahp2shCode) + twoRegMiscInst("vcvta.s16.f16", "NVcvt2shAQ", "SimdCvtOp", + ("int16_t",), 4, vcvtahp2shCode) + + vcvtnhp2shCode = vcvthp2hCode % ("true", "VfpRoundNearest") + twoRegMiscInst("vcvtn.s16.f16", "NVcvt2shND", "SimdCvtOp", + ("int16_t",), 2, vcvtnhp2shCode) + twoRegMiscInst("vcvtn.s16.f16", "NVcvt2shNQ", "SimdCvtOp", + ("int16_t",), 4, vcvtnhp2shCode) + + vcvtphp2shCode = vcvthp2hCode % ("true", "VfpRoundUpward") + twoRegMiscInst("vcvtp.s16.f16", "NVcvt2shPD", "SimdCvtOp", + ("int16_t",), 2, vcvtphp2shCode) + twoRegMiscInst("vcvtp.s16.f16", "NVcvt2shPQ", "SimdCvtOp", + ("int16_t",), 4, vcvtphp2shCode) + + vcvtmhp2shCode = vcvthp2hCode % ("true", "VfpRoundDown") + twoRegMiscInst("vcvtm.s16.f16", "NVcvt2shMD", "SimdCvtOp", + ("int16_t",), 2, vcvtmhp2shCode) + twoRegMiscInst("vcvtm.s16.f16", "NVcvt2shMQ", "SimdCvtOp", + ("int16_t",), 4, vcvtmhp2shCode) 
+ + vcvtsp2sCode = ''' + FPSCR fpscr = (FPSCR) FpscrExc; + VfpSavedState state = prepFpState(fpscr.rMode); + __asm__ __volatile__("" : "=m" (srcElem1) : "m" (srcElem1)); + float mid = bitsToFp(srcElem1, (float)0.0); + if (flushToZero(mid)) + fpscr.idc = 1; + destElem = vfpFpToFixed(mid, %s, 32, 0, true, %s); + __asm__ __volatile__("" :: "m" (destElem)); + finishVfp(fpscr, state, true); + FpscrExc = fpscr; + ''' + + vcvtasp2usCode = vcvtsp2sCode % ("false", "VfpRoundAway") + twoRegMiscInst("vcvta.u32.f32", "NVcvt2usAD", "SimdCvtOp", + ("uint32_t",), 2, vcvtasp2usCode) + twoRegMiscInst("vcvta.u32.f32", "NVcvt2usAQ", "SimdCvtOp", + ("uint32_t",), 4, vcvtasp2usCode) + + vcvtnsp2usCode = vcvtsp2sCode % ("false", "VfpRoundNearest") + twoRegMiscInst("vcvtn.u32.f32", "NVcvt2usND", "SimdCvtOp", + ("uint32_t",), 2, vcvtnsp2usCode) + twoRegMiscInst("vcvtn.u32.f32", "NVcvt2usNQ", "SimdCvtOp", + ("uint32_t",), 4, vcvtnsp2usCode) + + vcvtpsp2usCode = vcvtsp2sCode % ("false", "VfpRoundUpward") + twoRegMiscInst("vcvtp.u32.f32", "NVcvt2usPD", "SimdCvtOp", + ("uint32_t",), 2, vcvtpsp2usCode) + twoRegMiscInst("vcvtp.u32.f32", "NVcvt2usPQ", "SimdCvtOp", + ("uint32_t",), 4, vcvtpsp2usCode) + + vcvtmsp2usCode = vcvtsp2sCode % ("false", "VfpRoundDown") + twoRegMiscInst("vcvtm.u32.f32", "NVcvt2usMD", "SimdCvtOp", + ("uint32_t",), 2, vcvtmsp2usCode) + twoRegMiscInst("vcvtm.u32.f32", "NVcvt2usMQ", "SimdCvtOp", + ("uint32_t",), 4, vcvtmsp2usCode) + + vcvtasp2ssCode = vcvtsp2sCode % ("true", "VfpRoundAway") + twoRegMiscInst("vcvta.s32.f32", "NVcvt2ssAD", "SimdCvtOp", + ("int32_t",), 2, vcvtasp2ssCode) + twoRegMiscInst("vcvta.s32.f32", "NVcvt2ssAQ", "SimdCvtOp", + ("int32_t",), 4, vcvtasp2ssCode) + + vcvtnsp2ssCode = vcvtsp2sCode % ("true", "VfpRoundNearest") + twoRegMiscInst("vcvtn.s32.f32", "NVcvt2ssND", "SimdCvtOp", + ("int32_t",), 2, vcvtnsp2ssCode) + twoRegMiscInst("vcvtn.s32.f32", "NVcvt2ssNQ", "SimdCvtOp", + ("int32_t",), 4, vcvtnsp2ssCode) + + vcvtpsp2ssCode = vcvtsp2sCode % ("true", 
"VfpRoundUpward") + twoRegMiscInst("vcvtp.s32.f32", "NVcvt2ssPD", "SimdCvtOp", + ("int32_t",), 2, vcvtpsp2ssCode) + twoRegMiscInst("vcvtp.s32.f32", "NVcvt2ssPQ", "SimdCvtOp", + ("int32_t",), 4, vcvtpsp2ssCode) + + vcvtmsp2ssCode = vcvtsp2sCode % ("true", "VfpRoundDown") + twoRegMiscInst("vcvtm.s32.f32", "NVcvt2ssMD", "SimdCvtOp", + ("int32_t",), 2, vcvtmsp2ssCode) + twoRegMiscInst("vcvtm.s32.f32", "NVcvt2ssMQ", "SimdCvtOp", + ("int32_t",), 4, vcvtmsp2ssCode) + vrsqrteCode = ''' destElem = unsignedRSqrtEstimate(srcElem1); ''' diff --git a/src/arch/generic/interrupts.hh b/src/arch/generic/interrupts.hh index 510775594e..c2ffce038d 100644 --- a/src/arch/generic/interrupts.hh +++ b/src/arch/generic/interrupts.hh @@ -89,6 +89,12 @@ class BaseInterrupts : public SimObject { panic("Interrupts::clearAll unimplemented!\n"); } + + virtual bool + isWakeUp() const + { + return true; + } }; } // namespace gem5 diff --git a/src/arch/isa_parser/isa_parser.py b/src/arch/isa_parser/isa_parser.py index 7cc95ed6e8..0499beab83 100755 --- a/src/arch/isa_parser/isa_parser.py +++ b/src/arch/isa_parser/isa_parser.py @@ -111,11 +111,12 @@ class Template: operands = SubOperandList(self.parser, compositeCode, d.operands) - myDict[ - "reg_idx_arr_decl" - ] = "RegId srcRegIdxArr[%d]; RegId destRegIdxArr[%d]" % ( - d.operands.numSrcRegs + d.srcRegIdxPadding, - d.operands.numDestRegs + d.destRegIdxPadding, + myDict["reg_idx_arr_decl"] = ( + "RegId srcRegIdxArr[%d]; RegId destRegIdxArr[%d]" + % ( + d.operands.numSrcRegs + d.srcRegIdxPadding, + d.operands.numDestRegs + d.destRegIdxPadding, + ) ) # The reinterpret casts are largely because an array with a known @@ -821,7 +822,7 @@ class ISAParser(Grammar): "DBLCOLON", "ASTERISK", # C preprocessor directives - "CPPDIRECTIVE" + "CPPDIRECTIVE", # The following are matched but never returned. 
commented out to # suppress PLY warning # newfile directive diff --git a/src/arch/micro_asm.py b/src/arch/micro_asm.py index 0329800896..5b4f79fce3 100644 --- a/src/arch/micro_asm.py +++ b/src/arch/micro_asm.py @@ -140,9 +140,9 @@ def handle_statement(parser, container, statement): if statement.is_microop: if statement.mnemonic not in parser.microops.keys(): raise Exception(f"Unrecognized mnemonic: {statement.mnemonic}") - parser.symbols[ - "__microopClassFromInsideTheAssembler" - ] = parser.microops[statement.mnemonic] + parser.symbols["__microopClassFromInsideTheAssembler"] = ( + parser.microops[statement.mnemonic] + ) try: microop = eval( f"__microopClassFromInsideTheAssembler({statement.params})", @@ -166,9 +166,9 @@ def handle_statement(parser, container, statement): elif statement.is_directive: if statement.name not in container.directives.keys(): raise Exception(f"Unrecognized directive: {statement.name}") - parser.symbols[ - "__directiveFunctionFromInsideTheAssembler" - ] = container.directives[statement.name] + parser.symbols["__directiveFunctionFromInsideTheAssembler"] = ( + container.directives[statement.name] + ) try: eval( f"__directiveFunctionFromInsideTheAssembler({statement.params})", diff --git a/src/arch/riscv/RiscvISA.py b/src/arch/riscv/RiscvISA.py index f87941d413..05854f48c5 100644 --- a/src/arch/riscv/RiscvISA.py +++ b/src/arch/riscv/RiscvISA.py @@ -114,6 +114,13 @@ class RiscvISA(BaseISA): enable_Zicbom_fs = Param.Bool(True, "Enable Zicbom extension in FS mode") enable_Zicboz_fs = Param.Bool(True, "Enable Zicboz extension in FS mode") + enable_Zcd = Param.Bool( + True, + "Enable Zcd extensions. " + "Setting this option to false implies that Zcmp and Zcmt are enabled, " + "as c.fsdsp overlaps with them."
+ "Refs: https://github.com/riscv/riscv-isa-manual/blob/main/src/zc.adoc", + ) wfi_resume_on_pending = Param.Bool( False, diff --git a/src/arch/riscv/decoder.cc b/src/arch/riscv/decoder.cc index ee5d313587..557be1cbef 100644 --- a/src/arch/riscv/decoder.cc +++ b/src/arch/riscv/decoder.cc @@ -44,6 +44,7 @@ Decoder::Decoder(const RiscvDecoderParams &p) : InstDecoder(p, &machInst) ISA *isa = dynamic_cast(p.isa); vlen = isa->getVecLenInBits(); elen = isa->getVecElemLenInBits(); + _enableZcd = isa->enableZcd(); reset(); } @@ -127,6 +128,7 @@ Decoder::decode(PCStateBase &_next_pc) emi.vtype8 = next_pc.vtype() & 0xff; emi.vill = next_pc.vtype().vill; emi.rv_type = static_cast(next_pc.rvType()); + emi.enable_zcd = _enableZcd; return decode(emi, next_pc.instAddr()); } diff --git a/src/arch/riscv/decoder.hh b/src/arch/riscv/decoder.hh index bf863fda22..d44455cd0b 100644 --- a/src/arch/riscv/decoder.hh +++ b/src/arch/riscv/decoder.hh @@ -62,6 +62,7 @@ class Decoder : public InstDecoder uint32_t vlen; uint32_t elen; + bool _enableZcd; virtual StaticInstPtr decodeInst(ExtMachInst mach_inst); diff --git a/src/arch/riscv/insts/SConscript b/src/arch/riscv/insts/SConscript index 2822cf86b4..2519b3e07d 100644 --- a/src/arch/riscv/insts/SConscript +++ b/src/arch/riscv/insts/SConscript @@ -34,3 +34,4 @@ Source('mem.cc', tags='riscv isa') Source('standard.cc', tags='riscv isa') Source('static_inst.cc', tags='riscv isa') Source('vector.cc', tags='riscv isa') +Source('zcmp.cc', tags='riscv isa') diff --git a/src/arch/riscv/insts/zcmp.cc b/src/arch/riscv/insts/zcmp.cc new file mode 100644 index 0000000000..018ea45a60 --- /dev/null +++ b/src/arch/riscv/insts/zcmp.cc @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2024 Google LLC + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "arch/riscv/insts/zcmp.hh" + +#include + +#include "arch/riscv/regs/int.hh" +#include "arch/riscv/utility.hh" + +namespace gem5 +{ + +namespace RiscvISA +{ + +CmMacroInst::CmMacroInst( + const char* mnem, ExtMachInst machInst, OpClass opClass) + : RiscvMacroInst(mnem, machInst, opClass), rlist(machInst.rlist) +{ +} + +// Ref: https://github.com/riscv-software-src/riscv-isa-sim/blob/f7d0dba60/ +// riscv/decode.h#L168 +uint64_t +CmMacroInst::stackAdj() const +{ + uint64_t stack_adj_base = 0; + switch (machInst.rlist) { + case 15: + stack_adj_base += 16; + [[fallthrough]]; + case 14: + if (machInst.rv_type == RV64) { + stack_adj_base += 16; + } + [[fallthrough]]; + case 13: + case 12: + stack_adj_base += 16; + [[fallthrough]]; + case 11: + case 10: + if (machInst.rv_type == RV64) { + stack_adj_base += 16; + } + [[fallthrough]]; + case 9: + case 8: + stack_adj_base += 16; + [[fallthrough]]; + case 7: + case 6: + if (machInst.rv_type == RV64) { + stack_adj_base += 16; + } + [[fallthrough]]; + case 5: + case 4: + stack_adj_base += 16; + break; + } + + return stack_adj_base + machInst.spimm * 16; +} + +std::string +CmMacroInst::getRlistStr() const +{ + std::string s = ""; + switch (machInst.rlist) { + case 15: + s = csprintf("{%s, %s-%s}", registerName(ReturnAddrReg), + registerName(int_reg::S0), + registerName(PushPopRegList[0])); + break; + case 14: + case 13: + case 12: + case 11: + case 10: + case 9: + case 8: + case 7: + case 6: + s = csprintf("{%s, %s-%s}", registerName(ReturnAddrReg), + registerName(int_reg::S0), + registerName(PushPopRegList[16-machInst.rlist])); + break; + case 5: + s = csprintf("{%s, %s}", registerName(ReturnAddrReg), + registerName(int_reg::S0)); + break; + case 4: + s = csprintf("{%s}", registerName(ReturnAddrReg)); + break; + default: + break; + } + + return s; +} + +} // namespace RiscvISA +} // namespace gem5 diff --git a/src/arch/riscv/insts/zcmp.hh b/src/arch/riscv/insts/zcmp.hh new file mode 100644 index 
0000000000..5f0d734b10 --- /dev/null +++ b/src/arch/riscv/insts/zcmp.hh @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2024 Google LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __ARCH_RISCV_INSTS_ZCMP_HH__ +#define __ARCH_RISCV_INSTS_ZCMP_HH__ + +#include + +#include "arch/riscv/insts/static_inst.hh" +#include "cpu/static_inst.hh" + +namespace gem5 +{ + +namespace RiscvISA +{ + +class CmMacroInst : public RiscvMacroInst +{ + public: + CmMacroInst(const char* mnem, ExtMachInst machInst, OpClass opClass); + + protected: + using RiscvMacroInst::RiscvMacroInst; + + uint64_t stackAdj() const; + std::string getRlistStr() const; + + uint64_t rlist; +}; + +} // namespace RiscvISA +} // namespace gem5 + +#endif // __ARCH_RISCV_INSTS_ZCMP_HH__ diff --git a/src/arch/riscv/interrupts.hh b/src/arch/riscv/interrupts.hh index a10479fb65..54cf501f0a 100644 --- a/src/arch/riscv/interrupts.hh +++ b/src/arch/riscv/interrupts.hh @@ -95,6 +95,11 @@ class Interrupts : public BaseInterrupts void clearAll() override; + bool isWakeUp() const override + { + return checkNonMaskableInterrupt() || (ip & ie).any(); + } + uint64_t readIP() const { return (uint64_t)ip.to_ulong(); } uint64_t readIE() const { return (uint64_t)ie.to_ulong(); } void setIP(const uint64_t& val) { ip = val; } diff --git a/src/arch/riscv/isa.cc b/src/arch/riscv/isa.cc index bcc22d7cb0..7f4d97f4e9 100644 --- a/src/arch/riscv/isa.cc +++ b/src/arch/riscv/isa.cc @@ -260,7 +260,7 @@ RegClass ccRegClass(CCRegClass, CCRegClassName, 0, debug::IntRegs); ISA::ISA(const Params &p) : BaseISA(p, "riscv"), _rvType(p.riscv_type), enableRvv(p.enable_rvv), vlen(p.vlen), elen(p.elen), _privilegeModeSet(p.privilege_mode_set), - _wfiResumeOnPending(p.wfi_resume_on_pending) + _wfiResumeOnPending(p.wfi_resume_on_pending), _enableZcd(p.enable_Zcd) { _regClasses.push_back(&intRegClass); _regClasses.push_back(&floatRegClass); diff --git a/src/arch/riscv/isa.hh b/src/arch/riscv/isa.hh index 29a75854c7..cda2df41e6 100644 --- a/src/arch/riscv/isa.hh +++ b/src/arch/riscv/isa.hh @@ -108,6 +108,14 @@ class ISA : public BaseISA */ const bool _wfiResumeOnPending; + /** + * Enable Zcd extensions. 
+ * Set the option to false implies the Zcmp and Zcmt is enable as c.fsdsp + * is overlap with them. + * Refs: https://github.com/riscv/riscv-isa-manual/blob/main/src/zc.adoc + */ + bool _enableZcd; + public: using Params = RiscvISAParams; @@ -184,6 +192,8 @@ class ISA : public BaseISA bool resumeOnPending() { return _wfiResumeOnPending; } + bool enableZcd() { return _enableZcd; } + virtual Addr getFaultHandlerAddr( RegIndex idx, uint64_t cause, bool intr) const; }; diff --git a/src/arch/riscv/isa/bitfields.isa b/src/arch/riscv/isa/bitfields.isa index 0b4fae7b82..5fc624acc1 100644 --- a/src/arch/riscv/isa/bitfields.isa +++ b/src/arch/riscv/isa/bitfields.isa @@ -34,6 +34,7 @@ // Bitfield definitions. // def bitfield RVTYPE rv_type; +def bitfield ENABLE_ZCD enable_zcd; def bitfield QUADRANT <1:0>; def bitfield OPCODE5 <6:2>; @@ -103,10 +104,13 @@ def bitfield CFUNCT1 <12>; def bitfield CFUNCT1BIT6 <6>; def bitfield CFUNCT2HIGH <11:10>; def bitfield CFUNCT2LOW <6:5>; +def bitfield CFUNCT2MID <9:8>; def bitfield RC1 <11:7>; def bitfield RC2 <6:2>; def bitfield RP1 <9:7>; def bitfield RP2 <4:2>; +def bitfield R1S <9:7>; +def bitfield R2S <4:2>; def bitfield FC1 <11:7>; def bitfield FC2 <6:2>; def bitfield FP2 <4:2>; diff --git a/src/arch/riscv/isa/decoder.isa b/src/arch/riscv/isa/decoder.isa index c1dc790f26..90efb8ad82 100644 --- a/src/arch/riscv/isa/decoder.isa +++ b/src/arch/riscv/isa/decoder.isa @@ -54,23 +54,25 @@ decode QUADRANT default Unknown::unknown() { Rp2 = rvSext(sp + imm); }}, uint64_t); format CompressedLoad { - 0x1: c_fld({{ - offset = CIMM3 << 3 | CIMM2 << 6; - }}, {{ - STATUS status = xc->readMiscReg(MISCREG_STATUS); - if (status.fs == FPUStatus::OFF) - return std::make_shared("FPU is off", - machInst); + 0x1: decode ENABLE_ZCD { + 0x1: c_fld({{ + offset = CIMM3 << 3 | CIMM2 << 6; + }}, {{ + STATUS status = xc->readMiscReg(MISCREG_STATUS); + if (status.fs == FPUStatus::OFF) + return std::make_shared("FPU is off", + machInst); - // Mutating any floating 
point register changes the FS bit - // of the STATUS CSR. - status.fs = FPUStatus::DIRTY; - xc->setMiscReg(MISCREG_STATUS, status); + // Mutating any floating point register changes the FS bit + // of the STATUS CSR. + status.fs = FPUStatus::DIRTY; + xc->setMiscReg(MISCREG_STATUS, status); - Fp2_bits = Mem; - }}, {{ - EA = rvSext(Rp1 + offset); - }}); + Fp2_bits = Mem; + }}, {{ + EA = rvSext(Rp1 + offset); + }}); + } 0x2: c_lw({{ offset = CIMM2<1:1> << 2 | CIMM3 << 3 | @@ -152,18 +154,20 @@ decode QUADRANT default Unknown::unknown() { } } format CompressedStore { - 0x5: c_fsd({{ - offset = CIMM3 << 3 | CIMM2 << 6; - }}, {{ - STATUS status = xc->readMiscReg(MISCREG_STATUS); - if (status.fs == FPUStatus::OFF) - return std::make_shared("FPU is off", - machInst); + 0x5: decode ENABLE_ZCD { + 0x1: c_fsd({{ + offset = CIMM3 << 3 | CIMM2 << 6; + }}, {{ + STATUS status = xc->readMiscReg(MISCREG_STATUS); + if (status.fs == FPUStatus::OFF) + return std::make_shared("FPU is off", + machInst); - Mem = Fp2_bits; - }}, {{ - EA = rvSext(Rp1 + offset); - }}); + Mem = Fp2_bits; + }}, {{ + EA = rvSext(Rp1 + offset); + }}); + } 0x6: c_sw({{ offset = CIMM2<1:1> << 2 | CIMM3 << 3 | @@ -381,23 +385,25 @@ decode QUADRANT default Unknown::unknown() { Rc1 = rvSext(Rc1 << imm); }}, uint64_t); format CompressedLoad { - 0x1: c_fldsp({{ - offset = CIMM5<4:3> << 3 | - CIMM1 << 5 | - CIMM5<2:0> << 6; - }}, {{ - STATUS status = xc->readMiscReg(MISCREG_STATUS); - if (status.fs == FPUStatus::OFF) - return std::make_shared("FPU is off", - machInst); + 0x1: decode ENABLE_ZCD { + 0x1: c_fldsp({{ + offset = CIMM5<4:3> << 3 | + CIMM1 << 5 | + CIMM5<2:0> << 6; + }}, {{ + STATUS status = xc->readMiscReg(MISCREG_STATUS); + if (status.fs == FPUStatus::OFF) + return std::make_shared("FPU is off", + machInst); - status.fs = FPUStatus::DIRTY; - xc->setMiscReg(MISCREG_STATUS, status); + status.fs = FPUStatus::DIRTY; + xc->setMiscReg(MISCREG_STATUS, status); - Fc1_bits = Mem; - }}, {{ - EA = rvSext(sp + offset); 
- }}); + Fc1_bits = Mem; + }}, {{ + EA = rvSext(sp + offset); + }}); + } 0x2: c_lwsp({{ offset = CIMM5<4:2> << 2 | CIMM1 << 5 | @@ -480,19 +486,35 @@ decode QUADRANT default Unknown::unknown() { } } format CompressedStore { - 0x5: c_fsdsp({{ - offset = CIMM6<5:3> << 3 | - CIMM6<2:0> << 6; - }}, {{ - STATUS status = xc->readMiscReg(MISCREG_STATUS); - if (status.fs == FPUStatus::OFF) - return std::make_shared("FPU is off", - machInst); + 0x5: decode ENABLE_ZCD { + 0x0: decode CFUNCT6LOW3 { + 0x3: decode CFUNCT2LOW { + 0x1: CmMvsa01::cm_mvsa01(); + 0x3: CmMva01s::cm_mva01s(); + } + 0x6: decode CFUNCT2MID { + 0x0: CmPush::cm_push(); + 0x2: CmPop::cm_pop(); + } + 0x7: decode CFUNCT2MID { + 0x0: CmPop::cm_popretz(is_ret=True, has_a0=True); + 0x2: CmPop::cm_popret(is_ret=True); + } + } + 0x1: c_fsdsp({{ + offset = CIMM6<5:3> << 3 | + CIMM6<2:0> << 6; + }}, {{ + STATUS status = xc->readMiscReg(MISCREG_STATUS); + if (status.fs == FPUStatus::OFF) + return std::make_shared("FPU is off", + machInst); - Mem_ud = Fc2_bits; - }}, {{ - EA = rvSext(sp + offset); - }}); + Mem_ud = Fc2_bits; + }}, {{ + EA = rvSext(sp + offset); + }}); + } 0x6: c_swsp({{ offset = CIMM6<5:2> << 2 | CIMM6<1:0> << 6; diff --git a/src/arch/riscv/isa/formats/formats.isa b/src/arch/riscv/isa/formats/formats.isa index 0102df17d7..377bc5d061 100644 --- a/src/arch/riscv/isa/formats/formats.isa +++ b/src/arch/riscv/isa/formats/formats.isa @@ -40,6 +40,7 @@ ##include "vector_conf.isa" ##include "vector_arith.isa" ##include "vector_mem.isa" +##include "zcmp.isa" // Include formats for nonstandard extensions ##include "compressed.isa" diff --git a/src/arch/riscv/isa/formats/zcmp.isa b/src/arch/riscv/isa/formats/zcmp.isa new file mode 100644 index 0000000000..263c880022 --- /dev/null +++ b/src/arch/riscv/isa/formats/zcmp.isa @@ -0,0 +1,782 @@ +// -*- mode:c++ -*- + +// Copyright (c) 2015 RISC-V Foundation +// Copyright (c) 2016 The University of Virginia +// Copyright (c) 2024 Google LLC +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer; +// redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution; +// neither the name of the copyright holders nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Cmpush template. 
+def template CmPushDeclare {{ + class %(class_name)s : public %(base_class)s + { + public: + %(class_name)s(ExtMachInst machInst); + + protected: + using %(base_class)s::%(base_class)s; + + std::string generateDisassembly( + Addr pc, const loader::SymbolTable *symtab) const override; + }; +}}; + + +def template CmPushConstructor {{ + %(class_name)s::%(class_name)s(ExtMachInst machInst) : + %(base_class)s("%(mnemonic)s", machInst, %(op_class)s) + { + StaticInstPtr cur_inst = nullptr; + if (rlist < 4) { + cur_inst = new Unknown(machInst); + cur_inst->setFlag(IsMicroop); + cur_inst->setDelayedCommit(); + microops.emplace_back(cur_inst); + } else { + int start_reg = 0; + if (rlist != 15) { + start_reg = (16-rlist); + } + + int offset = 0; + for (int i = start_reg; i < PushPopRegList.size(); i++) { + offset -= rvSelect(4, 8); + + if (machInst.rv_type == RV32) { + cur_inst = new %(class_name)s32MicroInst( + machInst, PushPopRegList[i], offset); + } else { + cur_inst = new %(class_name)s64MicroInst( + machInst, PushPopRegList[i], offset); + } + cur_inst->setDelayedCommit(); + microops.emplace_back(cur_inst); + } + + cur_inst = new %(class_name)sSpAdjMicroInst(machInst, -stackAdj()); + cur_inst->setDelayedCommit(); + microops.emplace_back(cur_inst); + } + + microops.front()->setFirstMicroop(); + microops.back()->setLastMicroop(); + } +}}; + +def template CmPushExecute {{ + std::string + %(class_name)s::generateDisassembly( + Addr pc, const loader::SymbolTable *symtab) const + { + std::stringstream ss; + ss << mnemonic << ' ' << getRlistStr() << ", " << (int64_t)-stackAdj(); + return ss.str(); + } +}}; + +def template CmStoreMicroDeclare {{ + class %(class_name)s : public %(base_class)s + { + public: + %(class_name)s(ExtMachInst machInst, RegId push_reg, int64_t offset); + + Fault execute(ExecContext *, trace::InstRecord *) const override; + Fault initiateAcc(ExecContext *, trace::InstRecord *) const override; + Fault completeAcc( + Packet *, ExecContext *, 
trace::InstRecord *) const override; + std::string generateDisassembly( + Addr, const loader::SymbolTable *) const override; + + protected: + using %(base_class)s::%(base_class)s; + + private: + %(reg_idx_arr_decl)s; + + int64_t offset; + Request::Flags memAccessFlags; + }; +}}; + +def template CmStoreMicroConstructor {{ + %(class_name)s::%(class_name)s( + ExtMachInst machInst, RegId push_reg, int64_t offset) + : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s), + offset(offset) + { + %(set_reg_idx_arr)s; + %(constructor)s; + } +}}; + +def template CmStoreMicroExecute {{ + Fault + %(class_name)s::execute( + ExecContext *xc, trace::InstRecord *traceData) const + { + Addr EA; + + %(op_decl)s; + %(op_rd)s; + %(ea_code)s; + + %(memacc_code)s; + + { + Fault fault = + writeMemAtomicLE(xc, traceData, Mem, EA, memAccessFlags, + nullptr); + if (fault != NoFault) + return fault; + } + + %(op_wb)s; + + return NoFault; + } + + std::string + %(class_name)s::generateDisassembly( + Addr pc, const loader::SymbolTable *symtab) const + { + std::stringstream ss; + ss << mnemonic << ' ' << registerName(srcRegIdx(1)) << ", " << + offset << '(' << registerName(srcRegIdx(0)) << ')'; + return ss.str(); + } +}}; + +def template CmStoreMicroInitiateAcc {{ + Fault + %(class_name)s::initiateAcc(ExecContext *xc, + trace::InstRecord *traceData) const + { + Addr EA; + + %(op_decl)s; + %(op_rd)s; + %(ea_code)s; + + %(memacc_code)s; + + { + Fault fault = writeMemTimingLE(xc, traceData, Mem, EA, + memAccessFlags, nullptr); + if (fault != NoFault) + return fault; + } + + %(op_wb)s; + + return NoFault; + } +}}; + +def template CmStoreMicroCompleteAcc {{ + Fault + %(class_name)s::completeAcc(PacketPtr pkt, ExecContext *xc, + trace::InstRecord *traceData) const + { + return NoFault; + } +}}; + +def template SpAdjMicroDeclare {{ + class %(class_name)s : public %(base_class)s + { + public: + %(class_name)s(ExtMachInst machInst, int64_t adj); + protected: + using %(base_class)s::%(base_class)s; + + 
Fault execute(ExecContext *, trace::InstRecord *) const override; + std::string generateDisassembly( + Addr, const loader::SymbolTable *) const override; + + private: + %(reg_idx_arr_decl)s; + + int64_t adj; + }; +}}; + +def template SpAdjMicroConstructor {{ + %(class_name)s::%(class_name)s(ExtMachInst machInst, int64_t adj) + : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s), adj(adj) + { + %(set_reg_idx_arr)s; + %(constructor)s; + } +}}; + +def template SpAdjMicroExecute {{ + Fault + %(class_name)s::execute( + ExecContext *xc, trace::InstRecord *traceData) const + { + %(op_decl)s; + %(op_rd)s; + %(code)s; + %(op_wb)s; + return NoFault; + } + + std::string + %(class_name)s::generateDisassembly( + Addr pc, const loader::SymbolTable *symtab) const + { + std::stringstream ss; + ss << mnemonic << ' ' << registerName(destRegIdx(0)) << ' ' + << registerName(srcRegIdx(0)) << ' ' << adj; + return ss.str(); + } +}}; + +// Cmpop decode template. +def template CmPopDeclare {{ + class %(class_name)s : public %(base_class)s + { + public: + %(class_name)s(ExtMachInst machInst); + + std::string generateDisassembly( + Addr pc, const loader::SymbolTable *symtab) const override; + + protected: + using %(base_class)s::%(base_class)s; + }; +}}; + + +def template CmPopConstructor {{ + %(class_name)s::%(class_name)s(ExtMachInst machInst) : + %(base_class)s("%(mnemonic)s", machInst, %(op_class)s) + { + StaticInstPtr cur_inst = nullptr; + if (rlist < 4) { + cur_inst = new Unknown(machInst); + cur_inst->setFlag(IsMicroop); + cur_inst->setDelayedCommit(); + microops.emplace_back(cur_inst); + } else { + int start_reg = 0; + if (rlist != 15) { + start_reg = (16-rlist); + } + + int offset = stackAdj(); + for (int i = start_reg; i < PushPopRegList.size(); i++) { + offset -= rvSelect(4, 8); + + if (machInst.rv_type == RV32) { + cur_inst = new %(class_name)s32MicroInst( + machInst, PushPopRegList[i], offset); + } else { + cur_inst = new %(class_name)s64MicroInst( + machInst, 
PushPopRegList[i], offset); + } + cur_inst->setDelayedCommit(); + microops.emplace_back(cur_inst); + } + + cur_inst = new %(class_name)sSpAdjMicroInst(machInst, stackAdj()); + cur_inst->setDelayedCommit(); + microops.emplace_back(cur_inst); + + %(move_a0_desc)s; + %(return_desc)s; + } + + microops.front()->setFirstMicroop(); + microops.back()->setLastMicroop(); + } +}}; + +def template CmPopExecute {{ + std::string + %(class_name)s::generateDisassembly( + Addr pc, const loader::SymbolTable *symtab) const + { + std::stringstream ss; + ss << mnemonic << ' ' << getRlistStr() << ", " << stackAdj(); + return ss.str(); + } +}}; + +def template CmLoadMicroDeclare {{ + class %(class_name)s : public %(base_class)s + { + public: + %(class_name)s(ExtMachInst machInst, RegId pop_reg, int64_t offset); + + Fault execute(ExecContext *, trace::InstRecord *) const override; + Fault initiateAcc(ExecContext *, trace::InstRecord *) const override; + Fault completeAcc( + Packet *, ExecContext *, trace::InstRecord *) const override; + std::string generateDisassembly( + Addr, const loader::SymbolTable *) const override; + + protected: + using %(base_class)s::%(base_class)s; + + private: + %(reg_idx_arr_decl)s; + + int64_t offset; + Request::Flags memAccessFlags; + }; +}}; + +def template CmLoadMicroConstructor {{ + %(class_name)s::%(class_name)s( + ExtMachInst machInst, RegId pop_reg, int64_t offset) + : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s), + offset(offset) + { + %(set_reg_idx_arr)s; + %(constructor)s; + } +}}; + +def template CmLoadMicroExecute {{ + Fault + %(class_name)s::execute( + ExecContext *xc, trace::InstRecord *traceData) const + { + Addr EA; + + %(op_decl)s; + %(op_rd)s; + %(ea_code)s; + + { + Fault fault = + readMemAtomicLE(xc, traceData, EA, Mem, memAccessFlags); + if (fault != NoFault) + return fault; + } + + %(memacc_code)s; + + %(op_wb)s; + + return NoFault; + } + + std::string + %(class_name)s::generateDisassembly( + Addr pc, const loader::SymbolTable 
*symtab) const + { + std::stringstream ss; + ss << mnemonic << ' ' << registerName(destRegIdx(0)) << ", " << + offset << '(' << registerName(srcRegIdx(0)) << ')'; + return ss.str(); + } +}}; + +def template CmLoadMicroInitiateAcc {{ + Fault + %(class_name)s::initiateAcc(ExecContext *xc, + trace::InstRecord *traceData) const + { + Addr EA; + + %(op_src_decl)s; + %(op_rd)s; + %(ea_code)s; + + return initiateMemRead(xc, traceData, EA, Mem, memAccessFlags); + } +}}; + +def template CmLoadMicroCompleteAcc {{ + Fault + %(class_name)s::completeAcc(PacketPtr pkt, ExecContext *xc, + trace::InstRecord *traceData) const + { + %(op_decl)s; + %(op_rd)s; + + getMemLE(pkt, Mem, traceData); + + %(memacc_code)s; + %(op_wb)s; + + return NoFault; + } +}}; + +def template CmRetMicroDeclare {{ + class %(class_name)s : public %(base_class)s + { + public: + /// Constructor. + %(class_name)s(ExtMachInst machInst); + + protected: + using %(base_class)s::%(base_class)s; + + Fault execute(ExecContext *, trace::InstRecord *) const override; + + std::string + generateDisassembly( + Addr pc, const loader::SymbolTable *symtab) const override; + + std::unique_ptr branchTarget( + ThreadContext *tc) const override; + + using StaticInst::branchTarget; + + private: + %(reg_idx_arr_decl)s; + }; +}}; + +def template CmRetMicroConstructor {{ + %(class_name)s::%(class_name)s(ExtMachInst machInst) + : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s) + { + %(set_reg_idx_arr)s; + %(constructor)s; + } +}}; + +def template CmRetMicroExecute {{ + Fault + %(class_name)s::execute( + ExecContext *xc, trace::InstRecord *traceData) const + { + %(op_decl)s; + %(op_rd)s; + %(code)s; + %(op_wb)s; + return NoFault; + } + + std::unique_ptr + %(class_name)s::branchTarget(ThreadContext *tc) const + { + PCStateBase *pc_ptr = tc->pcState().clone(); + pc_ptr->as().set(rvSext(tc->getReg(srcRegIdx(0)) & ~0x1)); + return std::unique_ptr{pc_ptr}; + } + + std::string + %(class_name)s::generateDisassembly( + Addr pc, const 
loader::SymbolTable *symtab) const + { + std::stringstream ss; + ss << mnemonic << ' ' << registerName(srcRegIdx(0)); + return ss.str(); + } +}}; + +// Cmmvsa01 decode template +def template CmMvDeclare {{ + class %(class_name)s : public %(base_class)s + { + public: + %(class_name)s(ExtMachInst machInst); + + protected: + using %(base_class)s::%(base_class)s; + + std::string generateDisassembly( + Addr pc, const loader::SymbolTable *symtab) const override; + }; +}}; + +def template CmMvsa01Constructor {{ + %(class_name)s::%(class_name)s(ExtMachInst machInst) + : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s) + { + StaticInstPtr cur_inst; + cur_inst = new %(class_name)sMvMicroInst( + machInst, int_reg::A0, StackRegs[machInst.r1s]); + microops.emplace_back(cur_inst); + cur_inst = new %(class_name)sMvMicroInst( + machInst, int_reg::A1, StackRegs[machInst.r2s]); + microops.emplace_back(cur_inst); + + microops.front()->setFirstMicroop(); + microops.back()->setLastMicroop(); + } +}}; + +def template CmMva01sConstructor {{ + %(class_name)s::%(class_name)s(ExtMachInst machInst) + : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s) + { + StaticInstPtr cur_inst; + cur_inst = new %(class_name)sMvMicroInst( + machInst, StackRegs[machInst.r1s], int_reg::A0); + cur_inst->setDelayedCommit(); + microops.emplace_back(cur_inst); + cur_inst = new %(class_name)sMvMicroInst( + machInst, StackRegs[machInst.r2s], int_reg::A1); + cur_inst->setDelayedCommit(); + microops.emplace_back(cur_inst); + + microops.front()->setFirstMicroop(); + microops.back()->setLastMicroop(); + } +}}; + +def template CmMvExecute {{ + std::string + %(class_name)s::generateDisassembly( + Addr pc, const loader::SymbolTable *symtab) const + { + std::stringstream ss; + ss << mnemonic << ' ' << registerName(StackRegs[machInst.r1s]) + << ", " << registerName(StackRegs[machInst.r2s]); + return ss.str(); + } +}}; + +def template CmMvMicroDeclare {{ + class %(class_name)s : public %(base_class)s + { + 
public: + %(class_name)s(ExtMachInst machInst, RegId push_reg, RegId pop_reg); + protected: + using %(base_class)s::%(base_class)s; + + Fault execute(ExecContext *, trace::InstRecord *) const override; + std::string generateDisassembly( + Addr, const loader::SymbolTable *) const override; + + private: + %(reg_idx_arr_decl)s; + }; +}}; + +def template CmMvMicroConstructor {{ + %(class_name)s::%(class_name)s( + ExtMachInst machInst, RegId push_reg, RegId pop_reg) + : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s) + { + %(set_reg_idx_arr)s; + %(constructor)s; + } +}}; + +def template CmMvMicroExecute {{ + Fault + %(class_name)s::execute( + ExecContext *xc, trace::InstRecord *traceData) const + { + %(op_decl)s; + %(op_rd)s; + %(code)s; + %(op_wb)s; + return NoFault; + } + + std::string + %(class_name)s::generateDisassembly( + Addr pc, const loader::SymbolTable *symtab) const + { + std::stringstream ss; + ss << mnemonic << ' ' << registerName(destRegIdx(0)) << ' ' + << registerName(srcRegIdx(0)); + return ss.str(); + } +}}; + +def format CmPush(*flags) {{ + code = '' + macro_iop = InstObjParams(name, Name, 'CmMacroInst', code, flags) + header_output = CmPushDeclare.subst(macro_iop) + decoder_output = CmPushConstructor.subst(macro_iop) + exec_output = CmPushExecute.subst(macro_iop) + decode_block = BasicDecode.subst(macro_iop) + + memacc_code = 'Mem_sw = CmPushReg_sw;' + ea_code = 'EA = rvSext(sp + offset);' + micro32_iop = InstObjParams('lw', f'{Name}32MicroInst', 'RiscvMicroInst', + {'ea_code': ea_code, 'memacc_code': memacc_code}, + flags) + + mem_flags = [getAlignFlag(micro32_iop)] + s = '\n\tmemAccessFlags = ' + '|'.join(mem_flags) + ';' + micro32_iop.constructor += s + + header_output += CmStoreMicroDeclare.subst(micro32_iop) + decoder_output += CmStoreMicroConstructor.subst(micro32_iop) + exec_output += CmStoreMicroExecute.subst(micro32_iop) \ + + CmStoreMicroInitiateAcc.subst(micro32_iop) \ + + CmStoreMicroCompleteAcc.subst(micro32_iop) + + memacc_code = 
'Mem = CmPushReg;' + ea_code = 'EA = rvSext(sp + offset);' + micro64_iop = InstObjParams('ld', f'{Name}64MicroInst', 'RiscvMicroInst', + {'ea_code': ea_code, 'memacc_code': memacc_code}, + flags) + + mem_flags = [getAlignFlag(micro64_iop)] + s = '\n\tmemAccessFlags = ' + '|'.join(mem_flags) + ';' + micro64_iop.constructor += s + + header_output += CmStoreMicroDeclare.subst(micro64_iop) + decoder_output += CmStoreMicroConstructor.subst(micro64_iop) + exec_output += CmStoreMicroExecute.subst(micro64_iop) \ + + CmStoreMicroInitiateAcc.subst(micro64_iop) \ + + CmStoreMicroCompleteAcc.subst(micro64_iop) + + code = 'spd = rvSext(sp + adj);' + sp_adj_iop = InstObjParams('addi', f'{Name}SpAdjMicroInst', + 'RiscvMicroInst', code, flags) + + header_output += SpAdjMicroDeclare.subst(sp_adj_iop) + decoder_output += SpAdjMicroConstructor.subst(sp_adj_iop) + exec_output += SpAdjMicroExecute.subst(sp_adj_iop) +}}; + +def format CmPop(is_ret=False, has_a0=False, *flags) {{ + code = '' + flags = [] + has_a0 = eval(has_a0) + is_ret = eval(is_ret) + move_a0_desc = '' + return_desc = '' + + if has_a0: + move_a0_desc = rf''' + cur_inst = new {Name}MvMicroInst( + machInst, ReturnValueReg, int_reg::Zero); + microops.emplace_back(cur_inst); + ''' + + if is_ret: + return_desc = rf''' + cur_inst = new {Name}RetMicroInst(machInst); + microops.emplace_back(cur_inst); + ''' + + macro_iop = InstObjParams(name, Name, 'CmMacroInst', + {'code': code, 'move_a0_desc': move_a0_desc, + 'return_desc': return_desc}, + flags) + header_output = CmPopDeclare.subst(macro_iop) + decoder_output = CmPopConstructor.subst(macro_iop) + exec_output = CmPopExecute.subst(macro_iop) + decode_block = BasicDecode.subst(macro_iop) + + memacc_code = 'CmPopReg_sw = Mem_sw;' + ea_code = 'EA = rvSext(sp + offset);' + micro32_iop = InstObjParams('lw', f'{Name}32MicroInst', 'RiscvMicroInst', + {'ea_code': ea_code, 'memacc_code': memacc_code}, + flags) + + mem_flags = [getAlignFlag(micro32_iop)] + s = '\n\tmemAccessFlags = ' + 
'|'.join(mem_flags) + ';' + micro32_iop.constructor += s + + header_output += CmLoadMicroDeclare.subst(micro32_iop) + decoder_output += CmLoadMicroConstructor.subst(micro32_iop) + exec_output += CmLoadMicroExecute.subst(micro32_iop) \ + + CmLoadMicroInitiateAcc.subst(micro32_iop) \ + + CmLoadMicroCompleteAcc.subst(micro32_iop) + + memacc_code = 'CmPopReg = Mem;' + ea_code = 'EA = rvSext(sp + offset);' + micro64_iop = InstObjParams('ld', f'{Name}64MicroInst', 'RiscvMicroInst', + {'ea_code': ea_code, 'memacc_code': memacc_code}, + flags) + + mem_flags = [getAlignFlag(micro64_iop)] + s = '\n\tmemAccessFlags = ' + '|'.join(mem_flags) + ';' + micro64_iop.constructor += s + + header_output += CmLoadMicroDeclare.subst(micro64_iop) + decoder_output += CmLoadMicroConstructor.subst(micro64_iop) + exec_output += CmLoadMicroExecute.subst(micro64_iop) \ + + CmLoadMicroInitiateAcc.subst(micro64_iop) \ + + CmLoadMicroCompleteAcc.subst(micro64_iop) + + code = 'spd = rvSext(sp + adj);' + sp_adj_iop = InstObjParams('addi', f'{Name}SpAdjMicroInst', + 'RiscvMicroInst', code, flags) + + header_output += SpAdjMicroDeclare.subst(sp_adj_iop) + decoder_output += SpAdjMicroConstructor.subst(sp_adj_iop) + exec_output += SpAdjMicroExecute.subst(sp_adj_iop) + + if has_a0: + code = 'CmPopReg = CmPushReg;' + has_a0_iop = InstObjParams('mv', f'{Name}MvMicroInst', + 'RiscvMicroInst', code, flags) + + header_output += CmMvMicroDeclare.subst(has_a0_iop) + decoder_output += CmMvMicroConstructor.subst(has_a0_iop) + exec_output += CmMvMicroExecute.subst(has_a0_iop) + + if is_ret: + code = 'NPC = rvSext(ra & (~0x1));' + ret_flags = ['IsIndirectControl', 'IsUncondControl', 'IsReturn'] + is_ret_iop = InstObjParams('jr', f'{Name}RetMicroInst', + 'RiscvMicroInst', code, ret_flags) + + header_output += CmRetMicroDeclare.subst(is_ret_iop) + decoder_output += CmRetMicroConstructor.subst(is_ret_iop) + exec_output += CmRetMicroExecute.subst(is_ret_iop) +}}; + +def format CmMvsa01() {{ + code = '' + flags = [] + 
iop = InstObjParams(name, Name, 'RiscvMacroInst', code, flags) + header_output = CmMvDeclare.subst(iop) + decoder_output = CmMvsa01Constructor.subst(iop) + exec_output = CmMvExecute.subst(iop) + decode_block = BasicDecode.subst(iop) + + code = 'CmPopReg = CmPushReg;' + micro_iop = InstObjParams('mv', f'{Name}MvMicroInst', 'RiscvMicroInst', + code, flags) + + header_output += CmMvMicroDeclare.subst(micro_iop) + decoder_output += CmMvMicroConstructor.subst(micro_iop) + exec_output += CmMvMicroExecute.subst(micro_iop) +}}; + +def format CmMva01s() {{ + code = '' + flags = [] + iop = InstObjParams(name, Name, 'RiscvMacroInst', code, flags) + header_output = CmMvDeclare.subst(iop) + decoder_output = CmMva01sConstructor.subst(iop) + exec_output = CmMvExecute.subst(iop) + decode_block = BasicDecode.subst(iop) + + code = 'CmPopReg = CmPushReg;' + micro_iop = InstObjParams('mv', f'{Name}MvMicroInst', 'RiscvMicroInst', + code, flags) + + header_output += CmMvMicroDeclare.subst(micro_iop) + decoder_output += CmMvMicroConstructor.subst(micro_iop) + exec_output += CmMvMicroExecute.subst(micro_iop) +}}; diff --git a/src/arch/riscv/isa/includes.isa b/src/arch/riscv/isa/includes.isa index b37e62bca8..4d53958723 100644 --- a/src/arch/riscv/isa/includes.isa +++ b/src/arch/riscv/isa/includes.isa @@ -55,6 +55,7 @@ output header {{ #include "arch/riscv/insts/static_inst.hh" #include "arch/riscv/insts/unknown.hh" #include "arch/riscv/insts/vector.hh" +#include "arch/riscv/insts/zcmp.hh" #include "arch/riscv/interrupts.hh" #include "cpu/static_inst.hh" #include "mem/packet.hh" diff --git a/src/arch/riscv/isa/operands.isa b/src/arch/riscv/isa/operands.isa index de36d902b1..e2a7522b94 100644 --- a/src/arch/riscv/isa/operands.isa +++ b/src/arch/riscv/isa/operands.isa @@ -70,10 +70,14 @@ def operands {{ 'Rp2': IntReg('ud', 'RP2 + 8', 'IsInteger', 3), 'ra': IntReg('ud', 'ReturnAddrReg', 'IsInteger', 1), 'sp': IntReg('ud', 'StackPointerReg', 'IsInteger', 2), + 'spd': IntReg('ud', 
'StackPointerReg', 'IsInteger', 1), 'a0': IntReg('ud', '10', 'IsInteger', 1), 'a1': IntReg('ud', '11', 'IsInteger', 2), + 'CmPushReg': IntReg('ud', 'push_reg', 'IsInteger', 3), + 'CmPopReg': IntReg('ud', 'pop_reg', 'IsInteger', 1), + 'Fd': FloatRegOp('df', 'FD', 'IsFloating', 1), 'Fd_bits': FloatRegOp('ud', 'FD', 'IsFloating', 1), 'Fs1': FloatRegOp('df', 'FS1', 'IsFloating', 2), diff --git a/src/arch/riscv/linux/linux.hh b/src/arch/riscv/linux/linux.hh index 997eb6af4c..17281340d7 100644 --- a/src/arch/riscv/linux/linux.hh +++ b/src/arch/riscv/linux/linux.hh @@ -34,6 +34,7 @@ #include "arch/riscv/utility.hh" #include "kern/linux/flag_tables.hh" #include "kern/linux/linux.hh" +#include "base/bitfield.hh" namespace gem5 { @@ -42,6 +43,101 @@ class RiscvLinux : public Linux { public: static const ByteOrder byteOrder = ByteOrder::little; + + enum RiscvHwprobeKey + { + Mvendorid, + Marchid, + Mimpid, + BaseBehavior, + IMAExt0, + Cpuperf0, + ZicbozBlockSize, + HighestVirtAddress, + TimeCsrFreq, + MisalignedScalarPerf + }; + + /* Increase RISCV_HWPROBE_MAX_KEY when adding items. 
*/ + #define RISCV_HWPROBE_MAX_KEY 9 + + BitUnion64(key_base_behavior_t) + Bitfield<0> ima; + EndBitUnion(key_base_behavior_t) + + BitUnion64(key_ima_ext_0_t) + Bitfield<49> ZAWRS; + Bitfield<48> ZCMOP; + Bitfield<47> ZCF; + Bitfield<46> ZCD; + Bitfield<45> ZCB; + Bitfield<44> ZCA; + Bitfield<43> ZIMOP; + Bitfield<42> ZVE64D; + Bitfield<41> ZVE64F; + Bitfield<40> ZVE64X; + Bitfield<39> ZVE32F; + Bitfield<38> ZVE32X; + Bitfield<37> ZIHINTPAUSE; + Bitfield<36> ZICOND; + Bitfield<35> ZACAS; + Bitfield<34> ZTSO; + Bitfield<33> ZFA; + Bitfield<32> ZVFHMIN; + Bitfield<31> ZVFH; + Bitfield<30> ZIHINTNTL; + Bitfield<29> ZFHMIN; + Bitfield<28> ZFH; + Bitfield<27> ZVKT; + Bitfield<26> ZVKSH; + Bitfield<25> ZVKSED; + Bitfield<24> ZVKNHB; + Bitfield<22> ZVKNHA; + Bitfield<21> ZVKNED; + Bitfield<20> ZVKG; + Bitfield<19> ZVKB; + Bitfield<18> ZVBC; + Bitfield<17> ZVBB; + Bitfield<16> ZKT; + Bitfield<15> ZKSH; + Bitfield<14> ZKSED; + Bitfield<13> ZKNH; + Bitfield<12> ZKNE; + Bitfield<11> ZKND; + Bitfield<10> ZBKX; + Bitfield<9> ZBKC; + Bitfield<8> ZBKB; + Bitfield<7> ZBC; + Bitfield<6> ZICBOZ; + Bitfield<5> ZBS; + Bitfield<4> ZBB; + Bitfield<3> ZBA; + Bitfield<2> V; + Bitfield<1> C; + Bitfield<0> FD; + EndBitUnion(key_ima_ext_0_t) + + enum MisalignedScalarPerf + { + Unknown, + Emulated, + Slow, + Fast, + Unsupported + }; + + /* Flags */ + #define RISCV_HWPROBE_WHICH_CPUS (1 << 0) + + struct riscv_hwprobe { + int64_t key; + uint64_t value; + }; + + typedef struct cpumask { + size_t size; + uint64_t bits[]; + } cpumask_t; }; class RiscvLinux64 : public RiscvLinux, public OpenFlagTable @@ -195,6 +291,21 @@ class RiscvLinux64 : public RiscvLinux, public OpenFlagTable uint32_t mem_unit; }; + struct tgt_clone_args + { + uint64_t flags; + uint64_t pidfd; + uint64_t child_tid; + uint64_t parent_tid; + uint64_t exit_signal; + uint64_t stack; + uint64_t stack_size; + uint64_t tls; + uint64_t set_tid; + uint64_t set_tid_size; + uint64_t cgroup; + }; + static void archClone(uint64_t flags, 
Process *pp, Process *cp, diff --git a/src/arch/riscv/linux/se_workload.cc b/src/arch/riscv/linux/se_workload.cc index c1af16fb3b..d3015202b7 100644 --- a/src/arch/riscv/linux/se_workload.cc +++ b/src/arch/riscv/linux/se_workload.cc @@ -44,6 +44,8 @@ #include #include "arch/riscv/process.hh" +#include "arch/riscv/insts/static_inst.hh" +#include "arch/riscv/regs/misc.hh" #include "base/loader/object_file.hh" #include "base/trace.hh" #include "cpu/thread_context.hh" @@ -134,6 +136,388 @@ unameFunc32(SyscallDesc *desc, ThreadContext *tc, VPtr name) return 0; } +static inline void +cpumask_set_cpu(unsigned int cpu, RiscvLinux::cpumask_t *dstp) +{ + assert(cpu < dstp->size * 8); + auto &bits = dstp->bits[cpu / sizeof(uint64_t)]; + bits = insertBits(bits, cpu % sizeof(uint64_t), 1); +} + +static inline void +cpumask_clear_cpu(unsigned int cpu, RiscvLinux::cpumask_t *dstp) +{ + assert(cpu < dstp->size * 8); + auto &bits = dstp->bits[cpu / sizeof(uint64_t)]; + bits = insertBits(bits, cpu % sizeof(uint64_t), 0); +} + +static inline bool +cpumask_test_cpu(unsigned int cpu, const RiscvLinux::cpumask_t *cpumask) +{ + assert(cpu < cpumask->size * 8); + return bits(cpumask->bits[cpu / sizeof(uint64_t)], cpu % sizeof(uint64_t)) != 0; +} + +static inline void +cpumask_and(RiscvLinux::cpumask_t *dstp, const RiscvLinux::cpumask_t *src1p, + const RiscvLinux::cpumask_t *src2p) +{ + assert(dstp->size == src1p->size); + assert(dstp->size == src2p->size); + for (size_t i = 0; i < dstp->size / sizeof(dstp->bits[0]); i++) { + dstp->bits[i] = src1p->bits[i] & src2p->bits[i]; + } +} + +static inline bool +cpumask_empty(const RiscvLinux::cpumask_t *dstp) +{ + for (size_t i = 0; i < dstp->size / sizeof(dstp->bits[0]); i++) { + if (dstp->bits[i] != 0) { + return false; + } + } + return true; +} + +static inline void +cpumask_copy(RiscvLinux::cpumask_t *dstp, const RiscvLinux::cpumask_t *srcp) +{ + assert(dstp->size == srcp->size); + memcpy(dstp->bits, srcp->bits, srcp->size); +} + +static 
inline void +cpumask_clear(RiscvLinux::cpumask_t *dstp) +{ + memset(dstp->bits, 0, dstp->size); +} + +static inline RiscvLinux::cpumask_t * +cpumask_malloc(ThreadContext *tc) +{ + RiscvLinux::cpumask_t *cpumask; + + /* 8-bytes up-boundary alignment */ + size_t size = (tc->getSystemPtr()->threads.size() + sizeof(cpumask->bits[0]) - 1) / + sizeof(cpumask->bits[0]) * sizeof(cpumask->bits[0]); + cpumask = (RiscvLinux::cpumask_t *)malloc(sizeof(cpumask->size) + size); + if (cpumask != nullptr) { + cpumask->size = size; + cpumask_clear(cpumask); + } + + return cpumask; +} + +static inline void +cpumask_free(RiscvLinux::cpumask_t *cpu_online_mask) +{ + free(cpu_online_mask); +} + +static inline bool +riscv_hwprobe_key_is_valid(int64_t key) +{ + return key >= 0 && key <= RISCV_HWPROBE_MAX_KEY; +} + +static inline bool +hwprobe_key_is_bitmask(int64_t key) +{ + switch (key) { + case RiscvLinux::BaseBehavior: + case RiscvLinux::IMAExt0: + case RiscvLinux::Cpuperf0: + return true; + } + + return false; +} + +static inline bool +riscv_hwprobe_pair_cmp(RiscvLinux::riscv_hwprobe *pair, + RiscvLinux::riscv_hwprobe *other_pair) +{ + if (pair->key != other_pair->key) { + return false; + } + + if (hwprobe_key_is_bitmask(pair->key)) { + return (pair->value & other_pair->value) == other_pair->value; + } + + return pair->value == other_pair->value; +} + +static inline RiscvLinux::cpumask_t * +get_cpu_online_mask(ThreadContext *tc) +{ + RiscvLinux::cpumask_t *cpu_online_mask = cpumask_malloc(tc); + if (cpu_online_mask != nullptr) { + for (int i = 0; i < tc->getSystemPtr()->threads.size(); i++) { + CPU_SET(i, (cpu_set_t *)&cpu_online_mask->bits); + } + } + + return cpu_online_mask; +} + +static void +hwprobe_one_pair(ThreadContext *tc, RiscvLinux::riscv_hwprobe *pair, + RiscvLinux::cpumask_t *cpus) +{ + switch (pair->key) { + case RiscvLinux::Mvendorid: + pair->value = tc->readMiscRegNoEffect(CSRData.at(CSR_MVENDORID).physIndex); + break; + case RiscvLinux::Marchid: + pair->value = 
tc->readMiscRegNoEffect(CSRData.at(CSR_MARCHID).physIndex); + break; + case RiscvLinux::Mimpid: + pair->value = tc->readMiscRegNoEffect(CSRData.at(CSR_MIMPID).physIndex); + break; + case RiscvLinux::BaseBehavior: + { + MISA misa = tc->readMiscRegNoEffect(MISCREG_ISA); + RiscvLinux::key_base_behavior_t *base_behavior = + (RiscvLinux::key_base_behavior_t *)&pair->value; + if (misa.rvi && misa.rvm && misa.rva) { + base_behavior->ima = 1; + } + } + break; + case RiscvLinux::IMAExt0: + { + MISA misa = tc->readMiscRegNoEffect(MISCREG_ISA); + RiscvLinux::key_ima_ext_0_t *ext = (RiscvLinux::key_ima_ext_0_t *)&pair->value; + if (misa.rvf && misa.rvd) ext->FD = 1; + if (misa.rvc) ext->C = 1; + if (misa.rvv) ext->V = 1; + ext->ZBA = 1; + ext->ZBB = 1; + ext->ZBS = 1; + ext->ZICBOZ = 1; + ext->ZBC = 1; + ext->ZBKB = 1; + ext->ZBKC = 1; + ext->ZBKX = 1; + ext->ZKND = 1; + ext->ZKNE = 1; + ext->ZKNH = 1; + ext->ZKSED = 1; + ext->ZKSH = 1; + ext->ZKT = 1; + ext->ZFH = 1; + ext->ZFHMIN = 1; + ext->ZVFH = 1; + ext->ZVFHMIN = 1; + ext->ZICOND = 1; + ext->ZVE64D = 1; + ext->ZCB = 1; + ext->ZCD = 1; + ext->ZCF = 1; + } + break; + case RiscvLinux::Cpuperf0: + case RiscvLinux::MisalignedScalarPerf: + pair->value = RiscvLinux::Slow; + break; + case RiscvLinux::ZicbozBlockSize: + pair->value = tc->getSystemPtr()->cacheLineSize(); + break; + case RiscvLinux::HighestVirtAddress: + pair->value = tc->getProcessPtr()->memState->getMmapEnd(); + break; + + /* + * For forward compatibility, unknown keys don't fail the whole + * call, but get their element key set to -1 and value set to 0 + * indicating they're unrecognized. + */ + default: + pair->key = -1; + pair->value = 0; + break; + } +} + +template +static int +hwprobe_get_values(ThreadContext *tc, VPtr<> pairs, typename OS::size_t pair_count, + typename OS::size_t cpusetsize, VPtr<> cpus_user, unsigned int flags) +{ + /* Check the reserved flags. 
*/ + if (flags != 0) { + return -EINVAL; + } + + RiscvLinux::cpumask_t *cpu_online_mask = get_cpu_online_mask(tc); + if (cpu_online_mask == nullptr) { + return -ENOMEM; + } + + RiscvLinux::cpumask_t *cpus = cpumask_malloc(tc); + if (cpus == nullptr) { + cpumask_free(cpu_online_mask); + return -ENOMEM; + } + + if (cpusetsize > cpu_online_mask->size) { + cpusetsize = cpu_online_mask->size; + } + + RiscvLinux::riscv_hwprobe *pair; + BufferArg pairs_buf(pairs, sizeof(RiscvLinux::riscv_hwprobe) * pair_count); + + /* + * The interface supports taking in a CPU mask, and returns values that + * are consistent across that mask. Allow userspace to specify NULL and + * 0 as a shortcut to all online CPUs. + */ + if (cpusetsize == 0 && !cpus_user) { + cpumask_copy(cpus, cpu_online_mask); + cpusetsize = cpu_online_mask->size; + } else { + BufferArg cpus_user_buf(cpus_user, cpusetsize); + cpus_user_buf.copyIn(SETranslatingPortProxy(tc)); + + cpu_online_mask->size = cpusetsize; + cpus->size = cpusetsize; + memcpy(cpus->bits, cpus_user_buf.bufferPtr(), cpusetsize); + + /* + * Userspace must provide at least one online CPU, without that + * there's no way to define what is supported. 
+ */ + cpumask_and(cpus, cpus, cpu_online_mask); + if (cpumask_empty(cpus)) { + cpumask_free(cpu_online_mask); + cpumask_free(cpus); + return -EINVAL; + } + } + + pairs_buf.copyIn(SETranslatingPortProxy(tc)); + pair = (RiscvLinux::riscv_hwprobe *)pairs_buf.bufferPtr(); + + for (size_t i = 0; i < pair_count; i++, pair++) { + pair->value = 0; + hwprobe_one_pair(tc, pair, cpus); + } + + pairs_buf.copyOut(SETranslatingPortProxy(tc)); + + cpumask_free(cpu_online_mask); + cpumask_free(cpus); + + return 0; +} + +template +static int +hwprobe_get_cpus(ThreadContext *tc, VPtr<> pairs, typename OS::size_t pair_count, + typename OS::size_t cpusetsize, VPtr<> cpus_user, unsigned int flags) +{ + if (flags != RISCV_HWPROBE_WHICH_CPUS) { + return -EINVAL; + } + + if (cpusetsize == 0 || !cpus_user) { + return -EINVAL; + } + + RiscvLinux::cpumask_t *cpu_online_mask = get_cpu_online_mask(tc); + if (cpu_online_mask == nullptr) { + return -ENOMEM; + } + + RiscvLinux::cpumask_t *cpus = cpumask_malloc(tc); + if (cpus == nullptr) { + cpumask_free(cpu_online_mask); + return -ENOMEM; + } + + RiscvLinux::cpumask_t *one_cpu = cpumask_malloc(tc); + if (one_cpu == nullptr) { + cpumask_free(cpu_online_mask); + cpumask_free(cpus); + return -ENOMEM; + } + + if (cpusetsize > cpu_online_mask->size) { + cpusetsize = cpu_online_mask->size; + } + + RiscvLinux::riscv_hwprobe *pair; + BufferArg cpus_user_buf(cpus_user, cpusetsize); + cpus_user_buf.copyIn(SETranslatingPortProxy(tc)); + memcpy(cpus->bits, cpus_user_buf.bufferPtr(), cpusetsize); + + if (cpumask_empty(cpus)) { + cpumask_copy(cpus, cpu_online_mask); + cpusetsize = cpu_online_mask->size; + } + + cpumask_and(cpus, cpus, cpu_online_mask); + + BufferArg pairs_buf(pairs, sizeof(RiscvLinux::riscv_hwprobe) * pair_count); + pairs_buf.copyIn(SETranslatingPortProxy(tc)); + pair = (RiscvLinux::riscv_hwprobe *)pairs_buf.bufferPtr(); + + for (size_t i = 0; i < pair_count; i++, pair++) { + if (!riscv_hwprobe_key_is_valid(pair->key)) { + *pair = 
(RiscvLinux::riscv_hwprobe){ .key = -1, .value = 0 }; + memset(cpus_user_buf.bufferPtr(), 0, cpusetsize); + break; + } + + RiscvLinux::riscv_hwprobe tmp = + (RiscvLinux::riscv_hwprobe){ .key = pair->key, .value = 0 }; + + for (int cpu = 0; cpu < cpusetsize * 8; cpu++) { + if (!cpumask_test_cpu(cpu, cpus)) { + continue; + } + + cpumask_set_cpu(cpu, one_cpu); + + hwprobe_one_pair(tc, &tmp, one_cpu); + + if (!riscv_hwprobe_pair_cmp(&tmp, pair)) { + cpumask_clear_cpu(cpu, cpus); + } + + cpumask_clear_cpu(cpu, one_cpu); + } + } + + pairs_buf.copyOut(SETranslatingPortProxy(tc)); + cpus_user_buf.copyOut(SETranslatingPortProxy(tc)); + + cpumask_free(cpu_online_mask); + cpumask_free(cpus); + cpumask_free(one_cpu); + + return 0; +} + +template +static SyscallReturn +riscvHWProbeFunc(SyscallDesc *desc, ThreadContext *tc, VPtr<> pairs, + typename OS::size_t pair_count, typename OS::size_t cpusetsize, + VPtr<> cpus_user, unsigned int flags) +{ + if (flags & RISCV_HWPROBE_WHICH_CPUS) { + return hwprobe_get_cpus(tc, pairs, pair_count, cpusetsize, + cpus_user, flags); + } + + return hwprobe_get_values(tc, pairs, pair_count, cpusetsize, + cpus_user, flags); +} + SyscallDescTable EmuLinux::syscallDescs64 = { { 0, "io_setup" }, { 1, "io_destroy" }, @@ -382,6 +766,7 @@ SyscallDescTable EmuLinux::syscallDescs64 = { { 241, "perf_event_open" }, { 242, "accept4" }, { 243, "recvmmsg" }, + { 258, "riscv_hwprobe", riscvHWProbeFunc }, { 260, "wait4", wait4Func }, { 261, "prlimit64", prlimitFunc }, { 262, "fanotify_init" }, @@ -410,6 +795,33 @@ SyscallDescTable EmuLinux::syscallDescs64 = { { 285, "copy_file_range" }, { 286, "preadv2" }, { 287, "pwritev2" }, + { 424, "pidfd_send_signal" }, + { 425, "io_uring_setup" }, + { 426, "io_uring_enter" }, + { 427, "io_uring_register" }, + { 428, "open_tree" }, + { 429, "move_mount" }, + { 430, "fsopen" }, + { 431, "fsconfig" }, + { 432, "fsmount" }, + { 433, "fspick" }, + { 434, "pidfd_open" }, + { 435, "clone3", clone3Func }, + { 436, "close_range" }, 
+ { 437, "openat2" }, + { 438, "pidfd_getfd" }, + { 439, "faccessat2" }, + { 440, "process_madvise" }, + { 441, "epoll_pwait2" }, + { 442, "mount_setattr" }, + { 443, "quotactl_fd" }, + { 444, "landlock_create_ruleset" }, + { 445, "landlock_add_rule" }, + { 446, "landlock_restrict_self" }, + { 447, "memfd_secret" }, + { 448, "process_mrelease" }, + { 449, "futex_waitv" }, + { 450, "set_mempolicy_home_node" }, { 1024, "open", openFunc }, { 1025, "link", linkFunc }, { 1026, "unlink", unlinkFunc }, @@ -721,6 +1133,7 @@ SyscallDescTable EmuLinux::syscallDescs32 = { { 241, "perf_event_open" }, { 242, "accept4" }, { 243, "recvmmsg" }, + { 258, "riscv_hwprobe", riscvHWProbeFunc }, { 260, "wait4", wait4Func }, { 261, "prlimit64", prlimitFunc }, { 262, "fanotify_init" }, diff --git a/src/arch/riscv/regs/int.hh b/src/arch/riscv/regs/int.hh index 4ac01c60c1..dc7e37cdbe 100644 --- a/src/arch/riscv/regs/int.hh +++ b/src/arch/riscv/regs/int.hh @@ -149,6 +149,18 @@ inline constexpr RegId ArgumentRegs[] = { int_reg::A4, int_reg::A5, int_reg::A6, int_reg::A7 }; +const std::vector PushPopRegList = { + int_reg::S11, int_reg::S10, int_reg::S9, int_reg::S8, + int_reg::S7, int_reg::S6, int_reg::S5, int_reg::S4, + int_reg::S3, int_reg::S2, int_reg::S1, int_reg::S0, + int_reg::Ra +}; + +inline constexpr RegId StackRegs[] = { + int_reg::S0, int_reg::S1, int_reg::S2, int_reg::S3, + int_reg::S4, int_reg::S5, int_reg::S6, int_reg::S7, +}; + } // namespace RiscvISA } // namespace gem5 diff --git a/src/arch/riscv/types.hh b/src/arch/riscv/types.hh index c7edffc2f7..8b72c782a9 100644 --- a/src/arch/riscv/types.hh +++ b/src/arch/riscv/types.hh @@ -58,6 +58,7 @@ BitUnion64(ExtMachInst) // Decoder state Bitfield<63, 62> rv_type; Bitfield<61> compressed; + Bitfield<60> enable_zcd; // More bits for vector extension Bitfield<57, 41> vl; // [0, 2**16] Bitfield<40> vill; @@ -126,6 +127,8 @@ BitUnion64(ExtMachInst) Bitfield< 6, 2> rc2; Bitfield< 9, 7> rp1; Bitfield< 4, 2> rp2; + Bitfield< 9, 7> r1s; + 
Bitfield< 4, 2> r2s; Bitfield<11, 7> fc1; Bitfield< 6, 2> fc2; Bitfield< 4, 2> fp2; @@ -144,6 +147,8 @@ BitUnion64(ExtMachInst) Bitfield<12, 10> cimm3; Bitfield< 6, 5> cimm2; Bitfield<12> cimm1; + Bitfield< 7, 4> rlist; + Bitfield< 3, 2> spimm; // Pseudo instructions Bitfield<31, 25> m5func; // vector diff --git a/src/arch/x86/decoder.cc b/src/arch/x86/decoder.cc index af2456d6ab..ec595856a2 100644 --- a/src/arch/x86/decoder.cc +++ b/src/arch/x86/decoder.cc @@ -41,8 +41,6 @@ namespace gem5 namespace X86ISA { -X86ISAInst::MicrocodeRom Decoder::microcodeRom; - Decoder::State Decoder::doResetState() { @@ -671,9 +669,6 @@ Decoder::doImmediateState() return nextState; } -Decoder::InstBytes Decoder::dummy; -Decoder::InstCacheMap Decoder::instCacheMap; - StaticInstPtr Decoder::decode(ExtMachInst mach_inst, Addr addr) { diff --git a/src/arch/x86/decoder.hh b/src/arch/x86/decoder.hh index e4b1de96d7..eee48c1f76 100644 --- a/src/arch/x86/decoder.hh +++ b/src/arch/x86/decoder.hh @@ -60,19 +60,19 @@ class Decoder : public InstDecoder // These are defined and documented in decoder_tables.cc static const uint8_t SizeTypeToSize[3][10]; typedef const uint8_t ByteTable[256]; - static ByteTable Prefixes[2]; + static const ByteTable Prefixes[2]; - static ByteTable UsesModRMOneByte; - static ByteTable UsesModRMTwoByte; - static ByteTable UsesModRMThreeByte0F38; - static ByteTable UsesModRMThreeByte0F3A; + static const ByteTable UsesModRMOneByte; + static const ByteTable UsesModRMTwoByte; + static const ByteTable UsesModRMThreeByte0F38; + static const ByteTable UsesModRMThreeByte0F3A; - static ByteTable ImmediateTypeOneByte; - static ByteTable ImmediateTypeTwoByte; - static ByteTable ImmediateTypeThreeByte0F38; - static ByteTable ImmediateTypeThreeByte0F3A; + static const ByteTable ImmediateTypeOneByte; + static const ByteTable ImmediateTypeTwoByte; + static const ByteTable ImmediateTypeThreeByte0F38; + static const ByteTable ImmediateTypeThreeByte0F3A; - static 
X86ISAInst::MicrocodeRom microcodeRom; + X86ISAInst::MicrocodeRom microcodeRom; protected: using MachInst = uint64_t; @@ -88,7 +88,7 @@ class Decoder : public InstDecoder {} }; - static InstBytes dummy; + InstBytes dummy; // The bytes to be predecoded. MachInst fetchChunk; @@ -244,7 +244,7 @@ class Decoder : public InstDecoder decode_cache::InstMap *instMap = nullptr; typedef std::unordered_map< CacheKey, decode_cache::InstMap *> InstCacheMap; - static InstCacheMap instCacheMap; + InstCacheMap instCacheMap; StaticInstPtr decodeInst(ExtMachInst mach_inst); diff --git a/src/base/stats/units.hh b/src/base/stats/units.hh index 1d7d640ddb..eb4bcd240f 100644 --- a/src/base/stats/units.hh +++ b/src/base/stats/units.hh @@ -350,9 +350,9 @@ class Rate : public Base "otherwise, it would be a Ratio"); private: - Rate() {} + Rate() {} public: - Rate(Rate const&) = delete; + Rate(Rate const&) = delete; void operator=(Rate const&) = delete; static Rate* get() diff --git a/src/cpu/base.cc b/src/cpu/base.cc index ec219aa9f1..cc093e7000 100644 --- a/src/cpu/base.cc +++ b/src/cpu/base.cc @@ -240,7 +240,11 @@ BaseCPU::postInterrupt(ThreadID tid, int int_num, int index) // Only wake up syscall emulation if it is not waiting on a futex. // This is to model the fact that instructions such as ARM SEV // should wake up a WFE sleep, but not a futex syscall WAIT. - if (FullSystem || !system->futexMap.is_waiting(threadContexts[tid])) + // + // For RISC-V, the WFI sleep wake up is implementation defined. 
+ // The SiFive WFI wake up the hart only if mip & mie != 0 + if ((FullSystem && interrupts[tid]->isWakeUp()) || + !system->futexMap.is_waiting(threadContexts[tid])) wakeup(tid); } @@ -855,13 +859,13 @@ BaseCPU::GlobalStats::GlobalStats(statistics::Group *parent) "Simulator op (including micro ops) rate (op/s)") { simInsts - .functor(BaseCPU::numSimulatedInsts) + .functor(BaseCPU::GlobalStats::numSimulatedInsts) .precision(0) .prereq(simInsts) ; simOps - .functor(BaseCPU::numSimulatedOps) + .functor(BaseCPU::GlobalStats::numSimulatedOps) .precision(0) .prereq(simOps) ; diff --git a/src/cpu/base.hh b/src/cpu/base.hh index 0be0eda344..28cd90f3e2 100644 --- a/src/cpu/base.hh +++ b/src/cpu/base.hh @@ -156,6 +156,30 @@ class BaseCPU : public ClockedObject statistics::Formula hostInstRate; statistics::Formula hostOpRate; + + Counter previousInsts = 0; + Counter previousOps = 0; + + static Counter + numSimulatedInsts() + { + return totalNumSimulatedInsts() - (globalStats->previousInsts); + } + + static Counter + numSimulatedOps() + { + return totalNumSimulatedOps() - (globalStats->previousOps); + } + + void + resetStats() override + { + previousInsts = totalNumSimulatedInsts(); + previousOps = totalNumSimulatedOps(); + + statistics::Group::resetStats(); + } }; /** @@ -609,7 +633,7 @@ class BaseCPU : public ClockedObject static int numSimulatedCPUs() { return cpuList.size(); } static Counter - numSimulatedInsts() + totalNumSimulatedInsts() { Counter total = 0; @@ -621,7 +645,7 @@ class BaseCPU : public ClockedObject } static Counter - numSimulatedOps() + totalNumSimulatedOps() { Counter total = 0; diff --git a/src/cpu/o3/FUPool.py b/src/cpu/o3/FUPool.py index 67f523787b..b82b450700 100644 --- a/src/cpu/o3/FUPool.py +++ b/src/cpu/o3/FUPool.py @@ -1,4 +1,4 @@ -# Copyright (c) 2017 ARM Limited +# Copyright (c) 2017, 2024 Arm Limited # All rights reserved # # The license below extends only to copyright in the software and shall @@ -57,6 +57,7 @@ class DefaultFUPool(FUPool): 
FP_MultDiv(), ReadPort(), SIMD_Unit(), + Matrix_Unit(), PredALU(), WritePort(), RdWrPort(), diff --git a/src/cpu/o3/FuncUnitConfig.py b/src/cpu/o3/FuncUnitConfig.py index ab01b4aa27..5606046f5e 100644 --- a/src/cpu/o3/FuncUnitConfig.py +++ b/src/cpu/o3/FuncUnitConfig.py @@ -1,4 +1,4 @@ -# Copyright (c) 2010, 2017, 2020 ARM Limited +# Copyright (c) 2010, 2017, 2020, 2024 Arm Limited # All rights reserved. # # The license below extends only to copyright in the software and shall @@ -109,10 +109,27 @@ class SIMD_Unit(FUDesc): OpDesc(opClass="SimdExt"), OpDesc(opClass="SimdFloatExt"), OpDesc(opClass="SimdConfig"), + OpDesc(opClass="SimdAes"), + OpDesc(opClass="SimdAesMix"), + OpDesc(opClass="SimdSha1Hash"), + OpDesc(opClass="SimdSha1Hash2"), + OpDesc(opClass="SimdSha256Hash"), + OpDesc(opClass="SimdSha256Hash2"), + OpDesc(opClass="SimdShaSigma2"), + OpDesc(opClass="SimdShaSigma3"), ] count = 4 +class Matrix_Unit(FUDesc): + opList = [ + OpDesc(opClass="Matrix"), + OpDesc(opClass="MatrixMov"), + OpDesc(opClass="MatrixOP"), + ] + count = 1 + + class PredALU(FUDesc): opList = [OpDesc(opClass="SimdPredAlu")] count = 1 diff --git a/src/cpu/o3/probe/elastic_trace.cc b/src/cpu/o3/probe/elastic_trace.cc index a56ef17749..2988e83038 100644 --- a/src/cpu/o3/probe/elastic_trace.cc +++ b/src/cpu/o3/probe/elastic_trace.cc @@ -122,7 +122,7 @@ ElasticTrace::regEtraceListeners() { assert(!allProbesReg); inform("@%llu: No. of instructions committed = %llu, registering elastic" - " probe listeners", curTick(), cpu->numSimulatedInsts()); + " probe listeners", curTick(), cpu->totalNumSimulatedInsts()); // Create new listeners: provide method to be called upon a notify() for // each probe point. 
listeners.push_back(new ProbeListenerArg(this, diff --git a/src/cpu/pred/BranchPredictor.py b/src/cpu/pred/BranchPredictor.py index a10b2c2cef..5b90826315 100644 --- a/src/cpu/pred/BranchPredictor.py +++ b/src/cpu/pred/BranchPredictor.py @@ -38,6 +38,8 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. from m5.objects.ClockedObject import ClockedObject +from m5.objects.IndexingPolicies import * +from m5.objects.ReplacementPolicies import * from m5.params import * from m5.proxy import * from m5.SimObject import * @@ -83,6 +85,38 @@ class BranchTargetBuffer(ClockedObject): numThreads = Param.Unsigned(Parent.numThreads, "Number of threads") +class BTBIndexingPolicy(SimObject): + type = "BTBIndexingPolicy" + abstract = True + cxx_class = "gem5::IndexingPolicyTemplate" + cxx_header = "cpu/pred/btb_entry.hh" + cxx_template_params = ["class Types"] + + # Get the associativity + assoc = Param.Int(Parent.assoc, "associativity") + + +class BTBSetAssociative(BTBIndexingPolicy): + type = "BTBSetAssociative" + cxx_class = "gem5::BTBSetAssociative" + cxx_header = "cpu/pred/btb_entry.hh" + + # Get the number of entries in the BTB from the parent + num_entries = Param.Unsigned( + Parent.numEntries, "Number of entries in the BTB" + ) + + # Set shift for the index. Ignore lower 2 bits for a 4 byte instruction. + set_shift = Param.Unsigned(2, "Number of bits to shift PC to get index") + + # Total number of bits in the tag. 
+ # This is above the index and offset bit + tag_bits = Param.Unsigned(64, "number of bits in the tag") + + # Number of threads sharing the BTB + numThreads = Param.Unsigned(Parent.numThreads, "Number of threads") + + class SimpleBTB(BranchTargetBuffer): type = "SimpleBTB" cxx_class = "gem5::branch_prediction::SimpleBTB" @@ -93,6 +127,19 @@ class SimpleBTB(BranchTargetBuffer): instShiftAmt = Param.Unsigned( Parent.instShiftAmt, "Number of bits to shift instructions by" ) + associativity = Param.Unsigned(1, "BTB associativity") + btbReplPolicy = Param.BaseReplacementPolicy( + LRURP(), "BTB replacement policy" + ) + btbIndexingPolicy = Param.BTBIndexingPolicy( + BTBSetAssociative( + assoc=Parent.associativity, + num_entries=Parent.numEntries, + set_shift=Parent.instShiftAmt, + numThreads=1, + ), + "BTB indexing policy", + ) class IndirectPredictor(SimObject): diff --git a/src/cpu/pred/SConscript b/src/cpu/pred/SConscript index ec3102cada..6c03dd8a1b 100644 --- a/src/cpu/pred/SConscript +++ b/src/cpu/pred/SConscript @@ -45,7 +45,7 @@ SimObject('BranchPredictor.py', sim_objects=[ 'BranchPredictor', 'IndirectPredictor', 'SimpleIndirectPredictor', - 'BranchTargetBuffer', 'SimpleBTB', + 'BranchTargetBuffer', 'SimpleBTB', 'BTBIndexingPolicy', 'BTBSetAssociative', 'ReturnAddrStack', 'LocalBP', 'TournamentBP', 'BiModeBP', 'TAGEBase', 'TAGE', 'LoopPredictor', 'TAGE_SC_L_TAGE', 'TAGE_SC_L_TAGE_64KB', 'TAGE_SC_L_TAGE_8KB', diff --git a/src/cpu/pred/btb_entry.hh b/src/cpu/pred/btb_entry.hh new file mode 100644 index 0000000000..a445ac4775 --- /dev/null +++ b/src/cpu/pred/btb_entry.hh @@ -0,0 +1,288 @@ +/* + * Copyright (c) 2024 Pranith Kumar + * All rights reserved. + * + * The license below extends only to copyright in the software and shall + * not be construed as granting a license to any other intellectual + * property including but not limited to intellectual property relating + * to a hardware implementation of the functionality of the software + * licensed hereunder. 
You may use the software subject to the license + * terms below provided that you ensure that this notice is replicated + * unmodified and in its entirety in all distributions of the software, + * modified or unmodified, in source code or in binary form. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * @file + * Declaration of a BTB entry and BTB indexing policy. 
+ */ + +#ifndef __CPU_PRED_BTB_ENTRY_HH__ +#define __CPU_PRED_BTB_ENTRY_HH__ + +#include + +#include "arch/generic/pcstate.hh" +#include "base/intmath.hh" +#include "base/types.hh" +#include "cpu/static_inst.hh" +#include "mem/cache/replacement_policies/replaceable_entry.hh" +#include "mem/cache/tags/indexing_policies/base.hh" +#include "params/BTBIndexingPolicy.hh" +#include "params/BTBSetAssociative.hh" + +namespace gem5 { + +class BTBTagType +{ + public: + struct KeyType + { + Addr address; + ThreadID tid; + }; + using Params = BTBIndexingPolicyParams; +}; + +using BTBIndexingPolicy = IndexingPolicyTemplate; +template class IndexingPolicyTemplate; + +class BTBSetAssociative : public BTBIndexingPolicy +{ + public: + PARAMS(BTBSetAssociative); + using KeyType = BTBTagType::KeyType; + + BTBSetAssociative(const Params &p) + : BTBIndexingPolicy(p, p.num_entries, p.set_shift), + tagMask(mask(p.tag_bits)) + { + setNumThreads(p.numThreads); + } + + protected: + /** + * Extract the set index for the instruction PC based on tid. + */ + uint32_t + extractSet(const KeyType &key) const + { + return ((key.address >> setShift) + ^ (key.tid << (tagShift - setShift - log2NumThreads))) + & setMask; + } + + public: + /** + * Find all possible entries for insertion and replacement of an address. + */ + std::vector + getPossibleEntries(const KeyType &key) const override + { + auto set_idx = extractSet(key); + + assert(set_idx < sets.size()); + + return sets[set_idx]; + } + + /** + * Set number of threads sharing the BTB + */ + void + setNumThreads(unsigned num_threads) + { + log2NumThreads = log2i(num_threads); + } + + /** + * Generate the tag from the given address. 
+ */ + Addr + extractTag(const Addr addr) const override + { + return (addr >> tagShift) & tagMask; + } + + Addr regenerateAddr(const KeyType &key, + const ReplaceableEntry* entry) const override + { + panic("Not implemented!"); + return 0; + } + + private: + const uint64_t tagMask; + unsigned log2NumThreads; +}; + +namespace branch_prediction +{ + +class BTBEntry : public ReplaceableEntry +{ + public: + using IndexingPolicy = gem5::BTBIndexingPolicy; + using KeyType = gem5::BTBTagType::KeyType; + using TagExtractor = std::function; + + /** Default constructor */ + BTBEntry(TagExtractor ext) + : inst(nullptr), extractTag(ext), valid(false), tag({MaxAddr, -1}) + {} + + /** Update the target and instruction in the BTB entry. + * During insertion, only the tag (key) is updated. + */ + void + update(const PCStateBase &_target, + StaticInstPtr _inst) + { + set(target, _target); + inst = _inst; + } + + /** + * Checks if the given tag information corresponds to this entry's. + */ + bool + match(const KeyType &key) const + { + return isValid() && (tag.address == extractTag(key.address)) + && (tag.tid == key.tid); + } + + /** + * Insert the block by assigning it a tag and marking it valid. Touches + * block if it hadn't been touched previously. + */ + void + insert(const KeyType &key) + { + setValid(); + setTag({extractTag(key.address), key.tid}); + } + + /** Copy constructor */ + BTBEntry(const BTBEntry &other) + { + valid = other.valid; + tag = other.tag; + inst = other.inst; + extractTag = other.extractTag; + set(target, other.target); + } + + /** Assignment operator */ + BTBEntry& operator=(const BTBEntry &other) + { + valid = other.valid; + tag = other.tag; + inst = other.inst; + extractTag = other.extractTag; + set(target, other.target); + + return *this; + } + + /** + * Checks if the entry is valid. + */ + bool isValid() const { return valid; } + + /** + * Get tag associated to this block. + */ + KeyType getTag() const { return tag; } + + /** Invalidate the block. 
Its contents are no longer valid. */ + void + invalidate() + { + valid = false; + setTag({MaxAddr, -1}); + } + + /** The entry's target. */ + std::unique_ptr target; + + /** Pointer to the static branch inst at this address */ + StaticInstPtr inst; + + std::string + print() const override + { + return csprintf("tag: %#x tid: %d valid: %d | %s", tag.address, tag.tid, + isValid(), ReplaceableEntry::print()); + } + + protected: + /** + * Set tag associated to this block. + */ + void setTag(KeyType _tag) { tag = _tag; } + + /** Set valid bit. The block must be invalid beforehand. */ + void + setValid() + { + assert(!isValid()); + valid = true; + } + + private: + /** Callback used to extract the tag from the entry */ + TagExtractor extractTag; + + /** + * Valid bit. The contents of this entry are only valid if this bit is set. + * @sa invalidate() + * @sa insert() + */ + bool valid; + + /** The entry's tag. */ + KeyType tag; +}; + +} // namespace gem5::branch_prediction +/** + * This helper generates a tag extractor function object + * which will be typically used by Replaceable entries indexed + * with the BaseIndexingPolicy. + * It allows to "decouple" indexing from tagging. Those entries + * would call the functor without directly holding a pointer + * to the indexing policy which should reside in the cache. 
+ */ +static constexpr auto +genTagExtractor(BTBIndexingPolicy *ip) +{ + return [ip] (Addr addr) { return ip->extractTag(addr); }; +} + +} + +#endif //__CPU_PRED_BTB_ENTRY_HH__ diff --git a/src/cpu/pred/simple_btb.cc b/src/cpu/pred/simple_btb.cc index c78caac7a8..0260ced8b3 100644 --- a/src/cpu/pred/simple_btb.cc +++ b/src/cpu/pred/simple_btb.cc @@ -44,84 +44,38 @@ #include "base/trace.hh" #include "debug/BTB.hh" -namespace gem5 -{ - -namespace branch_prediction +namespace gem5::branch_prediction { SimpleBTB::SimpleBTB(const SimpleBTBParams &p) : BranchTargetBuffer(p), - numEntries(p.numEntries), - tagBits(p.tagBits), - instShiftAmt(p.instShiftAmt), - log2NumThreads(floorLog2(p.numThreads)) + btb("simpleBTB", p.numEntries, p.associativity, + p.btbReplPolicy, p.btbIndexingPolicy, + BTBEntry(genTagExtractor(p.btbIndexingPolicy))) { DPRINTF(BTB, "BTB: Creating BTB object.\n"); - if (!isPowerOf2(numEntries)) { + if (!isPowerOf2(p.numEntries)) { fatal("BTB entries is not a power of 2!"); } - - btb.resize(numEntries); - - for (unsigned i = 0; i < numEntries; ++i) { - btb[i].valid = false; - } - - idxMask = numEntries - 1; - - tagMask = (1 << tagBits) - 1; - - tagShiftAmt = instShiftAmt + floorLog2(numEntries); } void SimpleBTB::memInvalidate() { - for (unsigned i = 0; i < numEntries; ++i) { - btb[i].valid = false; - } + btb.clear(); } -inline -unsigned -SimpleBTB::getIndex(Addr instPC, ThreadID tid) -{ - // Need to shift PC over by the word offset. 
- return ((instPC >> instShiftAmt) - ^ (tid << (tagShiftAmt - instShiftAmt - log2NumThreads))) - & idxMask; -} - -inline -Addr -SimpleBTB::getTag(Addr instPC) -{ - return (instPC >> tagShiftAmt) & tagMask; -} - -SimpleBTB::BTBEntry * +BTBEntry * SimpleBTB::findEntry(Addr instPC, ThreadID tid) { - unsigned btb_idx = getIndex(instPC, tid); - Addr inst_tag = getTag(instPC); - - assert(btb_idx < numEntries); - - if (btb[btb_idx].valid - && inst_tag == btb[btb_idx].tag - && btb[btb_idx].tid == tid) { - return &btb[btb_idx]; - } - - return nullptr; + return btb.findEntry({instPC, tid}); } bool SimpleBTB::valid(ThreadID tid, Addr instPC) { - BTBEntry *entry = findEntry(instPC, tid); + BTBEntry *entry = btb.findEntry({instPC, tid}); return entry != nullptr; } @@ -134,11 +88,12 @@ SimpleBTB::lookup(ThreadID tid, Addr instPC, BranchType type) { stats.lookups[type]++; - BTBEntry *entry = findEntry(instPC, tid); + BTBEntry *entry = btb.accessEntry({instPC, tid}); if (entry) { return entry->target.get(); } + stats.misses[type]++; return nullptr; } @@ -146,31 +101,27 @@ SimpleBTB::lookup(ThreadID tid, Addr instPC, BranchType type) const StaticInstPtr SimpleBTB::getInst(ThreadID tid, Addr instPC) { - BTBEntry *entry = findEntry(instPC, tid); + BTBEntry *entry = btb.findEntry({instPC, tid}); if (entry) { return entry->inst; } + return nullptr; } void SimpleBTB::update(ThreadID tid, Addr instPC, - const PCStateBase &target, - BranchType type, StaticInstPtr inst) + const PCStateBase &target, + BranchType type, StaticInstPtr inst) { - unsigned btb_idx = getIndex(instPC, tid); - - assert(btb_idx < numEntries); - stats.updates[type]++; - btb[btb_idx].tid = tid; - btb[btb_idx].valid = true; - set(btb[btb_idx].target, target); - btb[btb_idx].tag = getTag(instPC); - btb[btb_idx].inst = inst; + BTBEntry *victim = btb.findVictim({instPC, tid}); + + btb.insertEntry({instPC, tid}, victim); + victim->update(target, inst); } -} // namespace branch_prediction -} // namespace gem5 + +} // 
namespace gem5::branch_prediction diff --git a/src/cpu/pred/simple_btb.hh b/src/cpu/pred/simple_btb.hh index 3c76890348..b1ef2a9fa5 100644 --- a/src/cpu/pred/simple_btb.hh +++ b/src/cpu/pred/simple_btb.hh @@ -41,15 +41,16 @@ #ifndef __CPU_PRED_SIMPLE_BTB_HH__ #define __CPU_PRED_SIMPLE_BTB_HH__ +#include "base/cache/associative_cache.hh" #include "base/logging.hh" #include "base/types.hh" #include "cpu/pred/btb.hh" +#include "cpu/pred/btb_entry.hh" +#include "mem/cache/replacement_policies/replaceable_entry.hh" +#include "mem/cache/tags/indexing_policies/base.hh" #include "params/SimpleBTB.hh" -namespace gem5 -{ - -namespace branch_prediction +namespace gem5::branch_prediction { class SimpleBTB : public BranchTargetBuffer @@ -60,44 +61,13 @@ class SimpleBTB : public BranchTargetBuffer void memInvalidate() override; bool valid(ThreadID tid, Addr instPC) override; const PCStateBase *lookup(ThreadID tid, Addr instPC, - BranchType type = BranchType::NoBranch) override; + BranchType type = BranchType::NoBranch) override; void update(ThreadID tid, Addr instPC, const PCStateBase &target_pc, - BranchType type = BranchType::NoBranch, - StaticInstPtr inst = nullptr) override; + BranchType type = BranchType::NoBranch, + StaticInstPtr inst = nullptr) override; const StaticInstPtr getInst(ThreadID tid, Addr instPC) override; - private: - struct BTBEntry - { - /** The entry's tag. */ - Addr tag = 0; - - /** The entry's target. */ - std::unique_ptr target; - - /** The entry's thread id. */ - ThreadID tid; - - /** Whether or not the entry is valid. */ - bool valid = false; - - /** Pointer to the static branch instruction at this address */ - StaticInstPtr inst = nullptr; - }; - - - /** Returns the index into the BTB, based on the branch's PC. - * @param inst_PC The branch to look up. - * @return Returns the index into the BTB. - */ - inline unsigned getIndex(Addr instPC, ThreadID tid); - - /** Returns the tag bits of a given address. - * @param inst_PC The branch's address. 
- * @return Returns the tag bits. - */ - inline Addr getTag(Addr instPC); /** Internal call to find an address in the BTB * @param instPC The branch's address. @@ -106,31 +76,9 @@ class SimpleBTB : public BranchTargetBuffer BTBEntry *findEntry(Addr instPC, ThreadID tid); /** The actual BTB. */ - std::vector btb; - - /** The number of entries in the BTB. */ - unsigned numEntries; - - /** The index mask. */ - unsigned idxMask; - - /** The number of tag bits per entry. */ - unsigned tagBits; - - /** The tag mask. */ - unsigned tagMask; - - /** Number of bits to shift PC when calculating index. */ - unsigned instShiftAmt; - - /** Number of bits to shift PC when calculating tag. */ - unsigned tagShiftAmt; - - /** Log2 NumThreads used for hashing threadid */ - unsigned log2NumThreads; + AssociativeCache btb; }; -} // namespace branch_prediction -} // namespace gem5 +} // namespace gem5::branch_prediction #endif // __CPU_PRED_SIMPLE_BTB_HH__ diff --git a/src/cpu/testers/gpu_ruby_test/TesterThread.py b/src/cpu/testers/gpu_ruby_test/TesterThread.py index 49388a76e1..6ddfc66ddc 100644 --- a/src/cpu/testers/gpu_ruby_test/TesterThread.py +++ b/src/cpu/testers/gpu_ruby_test/TesterThread.py @@ -41,3 +41,4 @@ class TesterThread(ClockedObject): thread_id = Param.Int("Unique TesterThread ID") num_lanes = Param.Int("Number of lanes this thread has") deadlock_threshold = Param.Cycles(1000000000, "Deadlock threshold") + cache_line_size = Param.UInt32("Size of cache line in cache") diff --git a/src/cpu/testers/gpu_ruby_test/address_manager.cc b/src/cpu/testers/gpu_ruby_test/address_manager.cc index a0c0670a8f..83d8a1a277 100644 --- a/src/cpu/testers/gpu_ruby_test/address_manager.cc +++ b/src/cpu/testers/gpu_ruby_test/address_manager.cc @@ -64,7 +64,9 @@ AddressManager::AddressManager(int n_atomic_locs, int n_normal_locs_per_atomic) std::shuffle( randAddressMap.begin(), randAddressMap.end(), - std::default_random_engine(random_mt.random(0,UINT_MAX)) + // TODO: This is a bug unrelated to 
this draft PR but the GPU tester is + // useful for testing this PR. + std::default_random_engine(random_mt.random(0,UINT_MAX-1)) ); // initialize atomic locations diff --git a/src/cpu/testers/gpu_ruby_test/dma_thread.cc b/src/cpu/testers/gpu_ruby_test/dma_thread.cc index 1d6f46c44b..2c4c610c51 100644 --- a/src/cpu/testers/gpu_ruby_test/dma_thread.cc +++ b/src/cpu/testers/gpu_ruby_test/dma_thread.cc @@ -70,7 +70,7 @@ DmaThread::issueLoadOps() Addr address = addrManager->getAddress(location); DPRINTF(ProtocolTest, "%s Episode %d: Issuing Load - Addr %s\n", this->getName(), curEpisode->getEpisodeId(), - ruby::printAddress(address)); + printAddress(address)); int load_size = sizeof(Value); @@ -127,7 +127,7 @@ DmaThread::issueStoreOps() DPRINTF(ProtocolTest, "%s Episode %d: Issuing Store - Addr %s - " "Value %d\n", this->getName(), - curEpisode->getEpisodeId(), ruby::printAddress(address), + curEpisode->getEpisodeId(), printAddress(address), new_value); auto req = std::make_shared(address, sizeof(Value), @@ -211,7 +211,7 @@ DmaThread::hitCallback(PacketPtr pkt) DPRINTF(ProtocolTest, "%s Episode %d: hitCallback - Command %s -" " Addr %s\n", this->getName(), curEpisode->getEpisodeId(), - resp_cmd.toString(), ruby::printAddress(addr)); + resp_cmd.toString(), printAddress(addr)); if (resp_cmd == MemCmd::SwapResp) { // response to a pending atomic diff --git a/src/cpu/testers/gpu_ruby_test/gpu_wavefront.cc b/src/cpu/testers/gpu_ruby_test/gpu_wavefront.cc index ae4078ee6c..516e77ddae 100644 --- a/src/cpu/testers/gpu_ruby_test/gpu_wavefront.cc +++ b/src/cpu/testers/gpu_ruby_test/gpu_wavefront.cc @@ -67,7 +67,7 @@ GpuWavefront::issueLoadOps() Addr address = addrManager->getAddress(location); DPRINTF(ProtocolTest, "%s Episode %d: Issuing Load - Addr %s\n", this->getName(), curEpisode->getEpisodeId(), - ruby::printAddress(address)); + printAddress(address)); int load_size = sizeof(Value); @@ -124,7 +124,7 @@ GpuWavefront::issueStoreOps() DPRINTF(ProtocolTest, "%s Episode %d: 
Issuing Store - Addr %s - " "Value %d\n", this->getName(), - curEpisode->getEpisodeId(), ruby::printAddress(address), + curEpisode->getEpisodeId(), printAddress(address), new_value); auto req = std::make_shared(address, sizeof(Value), @@ -178,7 +178,7 @@ GpuWavefront::issueAtomicOps() DPRINTF(ProtocolTest, "%s Episode %d: Issuing Atomic_Inc - Addr %s\n", this->getName(), curEpisode->getEpisodeId(), - ruby::printAddress(address)); + printAddress(address)); // must be aligned with store size assert(address % sizeof(Value) == 0); @@ -268,7 +268,7 @@ GpuWavefront::hitCallback(PacketPtr pkt) DPRINTF(ProtocolTest, "%s Episode %d: hitCallback - Command %s - " "Addr %s\n", this->getName(), curEpisode->getEpisodeId(), resp_cmd.toString(), - ruby::printAddress(addr)); + printAddress(addr)); // whether the transaction is done after this hitCallback bool isTransactionDone = true; diff --git a/src/cpu/testers/gpu_ruby_test/tester_thread.cc b/src/cpu/testers/gpu_ruby_test/tester_thread.cc index ce3a1bccc6..dbcfba8c3c 100644 --- a/src/cpu/testers/gpu_ruby_test/tester_thread.cc +++ b/src/cpu/testers/gpu_ruby_test/tester_thread.cc @@ -43,6 +43,7 @@ TesterThread::TesterThread(const Params &p) : ClockedObject(p), threadEvent(this, "TesterThread tick"), deadlockCheckEvent(this), + cacheLineSize(p.cache_line_size), threadId(p.thread_id), numLanes(p.num_lanes), tester(nullptr), addrManager(nullptr), port(nullptr), @@ -383,7 +384,7 @@ TesterThread::validateAtomicResp(Location loc, int lane, Value ret_val) ss << threadName << ": Atomic Op returned unexpected value\n" << "\tEpisode " << curEpisode->getEpisodeId() << "\n" << "\tLane ID " << lane << "\n" - << "\tAddress " << ruby::printAddress(addr) << "\n" + << "\tAddress " << printAddress(addr) << "\n" << "\tAtomic Op's return value " << ret_val << "\n"; // print out basic info @@ -409,7 +410,7 @@ TesterThread::validateLoadResp(Location loc, int lane, Value ret_val) << "\tTesterThread " << threadId << "\n" << "\tEpisode " << 
curEpisode->getEpisodeId() << "\n" << "\tLane ID " << lane << "\n" - << "\tAddress " << ruby::printAddress(addr) << "\n" + << "\tAddress " << printAddress(addr) << "\n" << "\tLoaded value " << ret_val << "\n" << "\tLast writer " << addrManager->printLastWriter(loc) << "\n"; @@ -467,7 +468,7 @@ TesterThread::printOutstandingReqs(const OutstandingReqTable& table, for (const auto& m : table) { for (const auto& req : m.second) { - ss << "\t\t\tAddr " << ruby::printAddress(m.first) + ss << "\t\t\tAddr " << printAddress(m.first) << ": delta (curCycle - issueCycle) = " << (cur_cycle - req.issueCycle) << std::endl; } @@ -488,4 +489,10 @@ TesterThread::printAllOutstandingReqs(std::stringstream& ss) const << pendingFenceCount << std::endl; } +std::string +TesterThread::printAddress(Addr addr) const +{ + return ruby::printAddress(addr, cacheLineSize * 8); +} + } // namespace gem5 diff --git a/src/cpu/testers/gpu_ruby_test/tester_thread.hh b/src/cpu/testers/gpu_ruby_test/tester_thread.hh index 9877d63c24..f31a5a3dea 100644 --- a/src/cpu/testers/gpu_ruby_test/tester_thread.hh +++ b/src/cpu/testers/gpu_ruby_test/tester_thread.hh @@ -132,6 +132,7 @@ class TesterThread : public ClockedObject {} }; + int cacheLineSize; // the unique global id of this thread int threadId; // width of this thread (1 for cpu thread & wf size for gpu wavefront) @@ -204,6 +205,7 @@ class TesterThread : public ClockedObject void printOutstandingReqs(const OutstandingReqTable& table, std::stringstream& ss) const; + std::string printAddress(Addr addr) const; }; } // namespace gem5 diff --git a/src/cpu/testers/rubytest/Check.cc b/src/cpu/testers/rubytest/Check.cc index 5a83d9ca27..b9c777526a 100644 --- a/src/cpu/testers/rubytest/Check.cc +++ b/src/cpu/testers/rubytest/Check.cc @@ -124,7 +124,8 @@ Check::initiatePrefetch() // push the subblock onto the sender state. 
The sequencer will // update the subblock on the return - pkt->senderState = new SenderState(m_address, req->getSize()); + pkt->senderState = new SenderState(m_address, req->getSize(), + CACHE_LINE_BITS); if (port->sendTimingReq(pkt)) { DPRINTF(RubyTest, "successfully initiated prefetch.\n"); @@ -161,7 +162,8 @@ Check::initiateFlush() // push the subblock onto the sender state. The sequencer will // update the subblock on the return - pkt->senderState = new SenderState(m_address, req->getSize()); + pkt->senderState = new SenderState(m_address, req->getSize(), + CACHE_LINE_BITS); if (port->sendTimingReq(pkt)) { DPRINTF(RubyTest, "initiating Flush - successful\n"); @@ -207,7 +209,8 @@ Check::initiateAction() // push the subblock onto the sender state. The sequencer will // update the subblock on the return - pkt->senderState = new SenderState(writeAddr, req->getSize()); + pkt->senderState = new SenderState(m_address, req->getSize(), + CACHE_LINE_BITS); if (port->sendTimingReq(pkt)) { DPRINTF(RubyTest, "initiating action - successful\n"); @@ -261,7 +264,8 @@ Check::initiateCheck() // push the subblock onto the sender state. 
The sequencer will // update the subblock on the return - pkt->senderState = new SenderState(m_address, req->getSize()); + pkt->senderState = new SenderState(m_address, req->getSize(), + CACHE_LINE_BITS); if (port->sendTimingReq(pkt)) { DPRINTF(RubyTest, "initiating check - successful\n"); @@ -291,7 +295,9 @@ Check::performCallback(ruby::NodeID proc, ruby::SubBlock* data, Cycles curTime) // This isn't exactly right since we now have multi-byte checks // assert(getAddress() == address); - assert(ruby::makeLineAddress(m_address) == ruby::makeLineAddress(address)); + int block_size_bits = CACHE_LINE_BITS; + assert(ruby::makeLineAddress(m_address, block_size_bits) == + ruby::makeLineAddress(address, block_size_bits)); assert(data != NULL); DPRINTF(RubyTest, "RubyTester Callback\n"); @@ -342,7 +348,7 @@ Check::performCallback(ruby::NodeID proc, ruby::SubBlock* data, Cycles curTime) } DPRINTF(RubyTest, "proc: %d, Address: 0x%x\n", proc, - ruby::makeLineAddress(m_address)); + ruby::makeLineAddress(m_address, block_size_bits)); DPRINTF(RubyTest, "Callback done\n"); debugPrint(); } diff --git a/src/cpu/testers/rubytest/Check.hh b/src/cpu/testers/rubytest/Check.hh index 78e2bda77e..0270b800d7 100644 --- a/src/cpu/testers/rubytest/Check.hh +++ b/src/cpu/testers/rubytest/Check.hh @@ -47,6 +47,7 @@ class SubBlock; const int CHECK_SIZE_BITS = 2; const int CHECK_SIZE = (1 << CHECK_SIZE_BITS); +const int CACHE_LINE_BITS = 6; class Check { diff --git a/src/cpu/testers/rubytest/RubyTester.hh b/src/cpu/testers/rubytest/RubyTester.hh index 9397126180..d306c405ef 100644 --- a/src/cpu/testers/rubytest/RubyTester.hh +++ b/src/cpu/testers/rubytest/RubyTester.hh @@ -90,7 +90,9 @@ class RubyTester : public ClockedObject { ruby::SubBlock subBlock; - SenderState(Addr addr, int size) : subBlock(addr, size) {} + SenderState(Addr addr, int size, int cl_size) + : subBlock(addr, size, cl_size) + {} }; diff --git a/src/dev/amdgpu/AMDGPU.py b/src/dev/amdgpu/AMDGPU.py index 35ffcfe528..b1f597aba8 
100644 --- a/src/dev/amdgpu/AMDGPU.py +++ b/src/dev/amdgpu/AMDGPU.py @@ -81,8 +81,6 @@ class AMDGPUDevice(PciDevice): InterruptPin = 2 ExpansionROM = 0 - rom_binary = Param.String("ROM binary dumped from hardware") - trace_file = Param.String("MMIO trace collected on hardware") checkpoint_before_mmios = Param.Bool( False, "Take a checkpoint before the device begins sending MMIOs" ) diff --git a/src/dev/amdgpu/amdgpu_device.cc b/src/dev/amdgpu/amdgpu_device.cc index c82d0de60c..50d152cda1 100644 --- a/src/dev/amdgpu/amdgpu_device.cc +++ b/src/dev/amdgpu/amdgpu_device.cc @@ -58,12 +58,6 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p) init_interrupt_count(0), _lastVMID(0), deviceMem(name() + ".deviceMem", p.memories, false, "", false) { - // Loading the rom binary dumped from hardware. - std::ifstream romBin; - romBin.open(p.rom_binary, std::ios::binary); - romBin.read((char *)rom.data(), ROM_SIZE); - romBin.close(); - // System pointer needs to be explicitly set for device memory since // DRAMCtrl uses it to get (1) cache line size and (2) the mem mode. // Note this means the cache line size is system wide. 
@@ -92,10 +86,6 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p) panic("Unknown GPU device %s\n", p.device_name); } - if (p.trace_file != "") { - mmioReader.readMMIOTrace(p.trace_file); - } - int sdma_id = 0; for (auto& s : p.sdmas) { s->setGPUDevice(this); diff --git a/src/dev/pci/host.cc b/src/dev/pci/host.cc index e7dea6c359..80cd9b5a5d 100644 --- a/src/dev/pci/host.cc +++ b/src/dev/pci/host.cc @@ -168,9 +168,14 @@ GenericPciHost::write(PacketPtr pkt) pkt->getSize()); PciDevice *const pci_dev(getDevice(dev_addr.first)); - panic_if(!pci_dev, - "%02x:%02x.%i: Write to config space on non-existent PCI device\n", - dev_addr.first.bus, dev_addr.first.dev, dev_addr.first.func); + warn_if(!pci_dev, + "%02x:%02x.%i: Write to config space on non-existent PCI device\n", + dev_addr.first.bus, dev_addr.first.dev, dev_addr.first.func); + + if (!pci_dev) { + pkt->makeAtomicResponse(); + return 20000; // 20ns default from PciDevice.py + } // @todo Remove this after testing pkt->headerDelay = pkt->payloadDelay = 0; diff --git a/src/dev/riscv/clint.cc b/src/dev/riscv/clint.cc index fc959aced4..a18555fc87 100644 --- a/src/dev/riscv/clint.cc +++ b/src/dev/riscv/clint.cc @@ -53,7 +53,7 @@ Clint::Clint(const Params ¶ms) : BasicPioDevice(params, params.pio_size), system(params.system), nThread(params.num_threads), - signal(params.name + ".signal", 0, this), + signal(params.name + ".signal", 0, this, INT_RTC), reset(params.name + ".reset"), resetMtimecmp(params.reset_mtimecmp), registers(params.name + ".registers", params.pio_addr, this, @@ -69,9 +69,11 @@ Clint::Clint(const Params ¶ms) : void Clint::raiseInterruptPin(int id) { - // Increment mtime + // Increment mtime when received RTC signal uint64_t& mtime = registers.mtime.get(); - mtime++; + if (id == INT_RTC) { + mtime++; + } for (int context_id = 0; context_id < nThread; context_id++) { @@ -261,7 +263,7 @@ Clint::doReset() { registers.msip[i].reset(); } // We need to update the mtip interrupt bits when reset - 
raiseInterruptPin(0); + raiseInterruptPin(INT_RESET); } } // namespace gem5 diff --git a/src/dev/riscv/clint.hh b/src/dev/riscv/clint.hh index 38f2117a16..2478eee0db 100644 --- a/src/dev/riscv/clint.hh +++ b/src/dev/riscv/clint.hh @@ -91,6 +91,13 @@ class Clint : public BasicPioDevice void raiseInterruptPin(int id); void lowerInterruptPin(int id) {} + // Interrupt ID + enum InterruptId + { + INT_RTC = 0, // received from RTC(signal port) + INT_RESET, // received from reset port + }; + // Register bank public: diff --git a/src/dev/virtio/base.hh b/src/dev/virtio/base.hh index 41ebb741d1..c31cd298b9 100644 --- a/src/dev/virtio/base.hh +++ b/src/dev/virtio/base.hh @@ -477,7 +477,7 @@ class VirtQueue : public Serializable Index index; }; - VirtRing(PortProxy &proxy, ByteOrder bo, uint16_t size) : + VirtRing(PortProxy &proxy, ByteOrder bo, uint16_t size) : header{0, 0}, ring(size), _proxy(proxy), _base(0), byteOrder(bo) {} @@ -550,7 +550,7 @@ class VirtQueue : public Serializable private: // Remove default constructor - VirtRing(); + VirtRing(); /** Guest physical memory proxy */ PortProxy &_proxy; diff --git a/src/mem/cache/cache_blk.hh b/src/mem/cache/cache_blk.hh index 2b24828259..a2027f25f1 100644 --- a/src/mem/cache/cache_blk.hh +++ b/src/mem/cache/cache_blk.hh @@ -461,7 +461,7 @@ class CacheBlk : public TaggedEntry protected: /** The current coherence status of this block. 
@sa CoherenceBits */ - unsigned coherence; + unsigned coherence = 0; // The following setters have been marked as protected because their // respective variables should only be modified at 2 moments: diff --git a/src/mem/cache/prefetch/Prefetcher.py b/src/mem/cache/prefetch/Prefetcher.py index 9864c922f6..85cc628d5b 100644 --- a/src/mem/cache/prefetch/Prefetcher.py +++ b/src/mem/cache/prefetch/Prefetcher.py @@ -599,6 +599,22 @@ class BOPPrefetcher(QueuedPrefetcher): on_inst = False +class SmsPrefetcher(QueuedPrefetcher): + # Paper: https://web.eecs.umich.edu/~twenisch/papers/isca06.pdf + type = "SmsPrefetcher" + cxx_class = "gem5::prefetch::Sms" + cxx_header = "mem/cache/prefetch/sms.hh" + ft_size = Param.Unsigned(64, "Size of Filter and Active generation table") + pht_size = Param.Unsigned(16384, "Size of pattern history table") + region_size = Param.Unsigned(4096, "Spatial region size") + + queue_squash = True + queue_filter = True + cache_snoop = True + prefetch_on_access = True + on_inst = False + + class SBOOEPrefetcher(QueuedPrefetcher): type = "SBOOEPrefetcher" cxx_class = "gem5::prefetch::SBOOE" diff --git a/src/mem/cache/prefetch/SConscript b/src/mem/cache/prefetch/SConscript index 8ce15e9688..c971b7541b 100644 --- a/src/mem/cache/prefetch/SConscript +++ b/src/mem/cache/prefetch/SConscript @@ -31,8 +31,9 @@ Import('*') SimObject('Prefetcher.py', sim_objects=[ 'BasePrefetcher', 'MultiPrefetcher', 'QueuedPrefetcher', 'StridePrefetcherHashedSetAssociative', 'StridePrefetcher', - 'TaggedPrefetcher', 'IndirectMemoryPrefetcher', 'SignaturePathPrefetcher', - 'SignaturePathPrefetcherV2', 'AccessMapPatternMatching', 'AMPMPrefetcher', + 'SmsPrefetcher', 'TaggedPrefetcher', 'IndirectMemoryPrefetcher', + 'SignaturePathPrefetcher', 'SignaturePathPrefetcherV2', + 'AccessMapPatternMatching', 'AMPMPrefetcher', 'DeltaCorrelatingPredictionTables', 'DCPTPrefetcher', 'IrregularStreamBufferPrefetcher', 'SlimAMPMPrefetcher', 'BOPPrefetcher', 'SBOOEPrefetcher', 
'STeMSPrefetcher', 'PIFPrefetcher']) @@ -47,6 +48,7 @@ Source('indirect_memory.cc') Source('pif.cc') Source('queued.cc') Source('sbooe.cc') +Source('sms.cc') Source('signature_path.cc') Source('signature_path_v2.cc') Source('slim_ampm.cc') diff --git a/src/mem/cache/prefetch/sms.cc b/src/mem/cache/prefetch/sms.cc new file mode 100644 index 0000000000..2ad4ef92e3 --- /dev/null +++ b/src/mem/cache/prefetch/sms.cc @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2024 Samsung Electronics + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * @file + * Describes a SMS prefetcher based on template policies. + */ + +#include "mem/cache/prefetch/sms.hh" + +#include "debug/HWPrefetch.hh" +#include "params/SmsPrefetcher.hh" + +namespace gem5 +{ + +namespace prefetch +{ + +Sms::Sms(const SmsPrefetcherParams &p) + : Queued(p), Max_Contexts(p.ft_size), MAX_PHTSize(p.pht_size), + Region_Size(p.region_size) +{ + AGT.clear(); + AGTPC.clear(); + FT.clear(); + PHT.clear(); + fifoFT.clear(); + lruAGT.clear(); + lruPHT.clear(); + +} +void +Sms::notifyEvict(const EvictionInfo &info) +{ + //Check if any active generation has ended + Addr region_base = roundDown(info.addr, Region_Size); + std::pair pc_offset = AGTPC[region_base]; + if (AGT.find(region_base) != AGT.end()) { + //remove old recording + if (PHT.find(pc_offset) != PHT.end()) { + PHT[pc_offset].clear(); + } + //Move from AGT to PHT + for (std::set::iterator it = AGT[region_base].begin(); + it != AGT[region_base].end(); it ++) { + PHT[pc_offset].insert(*it); + } + lruPHT.push_front(pc_offset); + } + + while (PHT.size() > MAX_PHTSize) { + PHT.erase(lruPHT.back()); + lruPHT.pop_back(); + } + + AGTPC.erase(region_base); + AGT.erase(region_base); +} +void +Sms::calculatePrefetch(const PrefetchInfo &pfi, + std::vector &addresses, + const CacheAccessor &cache) +{ + + if (!pfi.hasPC()) { + DPRINTF(HWPrefetch, "Ignoring request with no PC.\n"); + return; + } + + Addr blk_addr = blockAddress(pfi.getAddr()); + Addr pc = 
pfi.getPC(); + Addr region_base = roundDown(blk_addr, Region_Size); + Addr offset = blk_addr - region_base; + + //Training + if (AGT.find(region_base) != AGT.end()) { + assert (FT.find(region_base) == FT.end()); + // Record Pattern + AGT[region_base].insert(offset); + //update LRU + for (std::deque ::iterator lit = lruAGT.begin(); + lit != lruAGT.end(); lit ++) { + if ((*lit) == region_base) { + lruAGT.erase(lit); + lruAGT.push_front(region_base); + break; + } + } + } else if (FT.find(region_base) != FT.end()) { + //move entry from FT to AGT + AGT[region_base].insert(FT[region_base].second); + AGTPC[region_base] = FT[region_base]; + lruAGT.push_front(region_base); + //Record latest offset + AGT[region_base].insert(offset); + //Recycle FT entry + FT.erase(region_base); + //Make space for next entry + while (AGT.size() > Max_Contexts) { + AGT.erase(lruAGT.back()); + AGTPC.erase(lruAGT.back()); + lruAGT.pop_back(); + } + } else { + // Trigger Access + FT[region_base] = std::make_pair (pc,offset); + fifoFT.push_front(region_base); + while (FT.size() > Max_Contexts) { + FT.erase(fifoFT.back()); + fifoFT.pop_back(); + } + } + + //Prediction + std::pair pc_offset = std::make_pair(pc,offset); + if (PHT.find(pc_offset) != PHT.end()) { + for (std::set::iterator it = PHT[pc_offset].begin(); + it != PHT[pc_offset].end(); it ++) { + Addr pref_addr = blockAddress(region_base + (*it)); + addresses.push_back(AddrPriority(pref_addr,0)); + } + for (std::deque < std::pair >::iterator lit + = lruPHT.begin(); lit != lruPHT.end(); lit ++) { + if ((*lit) == pc_offset) { + lruPHT.erase(lit); + lruPHT.push_front(pc_offset); + break; + } + } + } + +} + +} // namespace prefetch +} // namespace gem5 diff --git a/src/mem/cache/prefetch/sms.hh b/src/mem/cache/prefetch/sms.hh new file mode 100644 index 0000000000..4bda1694dd --- /dev/null +++ b/src/mem/cache/prefetch/sms.hh @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2024 Samsung Electronics + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * @file + * Describes a SMS prefetcher. 
+ */ + +#ifndef __MEM_CACHE_PREFETCH_SMS_HH__ +#define __MEM_CACHE_PREFETCH_SMS_HH__ + +#include + +#include "mem/cache/prefetch/queued.hh" +#include "mem/packet.hh" + +namespace gem5 +{ + +struct SmsPrefetcherParams; + +namespace prefetch +{ + + +class Sms : public Queued +{ + + private: + const int Max_Contexts; //= 64; + const uint64_t MAX_PHTSize; //= 512; + const Addr Region_Size; //= 4096; + + std::map< Addr, std::set > AGT; + std::map< Addr, std::pair > AGTPC; + std::map< Addr, std::pair > FT; + std::map< std::pair , std::set > PHT; + std::deque fifoFT; + std::deque lruAGT; + std::deque< std::pair > lruPHT; + + using EvictionInfo = CacheDataUpdateProbeArg; + void notifyEvict(const EvictionInfo &info) override; + + public: + Sms(const SmsPrefetcherParams &p); + ~Sms() = default; + + void calculatePrefetch(const PrefetchInfo &pfi, + std::vector &addresses, + const CacheAccessor &cache) override; +}; + +} // namespace prefetch +} // namespace gem5 + +#endif // __MEM_CACHE_PREFETCH_SMS_HH__ diff --git a/src/mem/ruby/common/Address.cc b/src/mem/ruby/common/Address.cc index fcf291af51..8b120324c7 100644 --- a/src/mem/ruby/common/Address.cc +++ b/src/mem/ruby/common/Address.cc @@ -51,37 +51,33 @@ maskLowOrderBits(Addr addr, unsigned int number) } Addr -getOffset(Addr addr) +getOffset(Addr addr, int cacheLineBits) { - return bitSelect(addr, 0, RubySystem::getBlockSizeBits() - 1); -} - -Addr -makeLineAddress(Addr addr) -{ - return mbits(addr, 63, RubySystem::getBlockSizeBits()); + assert(cacheLineBits < 64); + return bitSelect(addr, 0, cacheLineBits - 1); } Addr makeLineAddress(Addr addr, int cacheLineBits) { + assert(cacheLineBits < 64); return maskLowOrderBits(addr, cacheLineBits); } // returns the next stride address based on line address Addr -makeNextStrideAddress(Addr addr, int stride) +makeNextStrideAddress(Addr addr, int stride, int cacheLineBytes) { - return makeLineAddress(addr) + - static_cast(RubySystem::getBlockSizeBytes()) * stride; + return 
makeLineAddress(addr, floorLog2(cacheLineBytes)) + + cacheLineBytes * stride; } std::string -printAddress(Addr addr) +printAddress(Addr addr, int cacheLineBits) { std::stringstream out; out << "[" << std::hex << "0x" << addr << "," << " line 0x" - << makeLineAddress(addr) << std::dec << "]"; + << makeLineAddress(addr, cacheLineBits) << std::dec << "]"; return out.str(); } diff --git a/src/mem/ruby/common/Address.hh b/src/mem/ruby/common/Address.hh index 565c3c1fb7..51e0b5417a 100644 --- a/src/mem/ruby/common/Address.hh +++ b/src/mem/ruby/common/Address.hh @@ -33,6 +33,7 @@ #include #include +#include "base/intmath.hh" #include "base/types.hh" namespace gem5 @@ -44,11 +45,10 @@ namespace ruby // selects bits inclusive Addr bitSelect(Addr addr, unsigned int small, unsigned int big); Addr maskLowOrderBits(Addr addr, unsigned int number); -Addr getOffset(Addr addr); -Addr makeLineAddress(Addr addr); +Addr getOffset(Addr addr, int cacheLineBits); Addr makeLineAddress(Addr addr, int cacheLineBits); -Addr makeNextStrideAddress(Addr addr, int stride); -std::string printAddress(Addr addr); +Addr makeNextStrideAddress(Addr addr, int stride, int cacheLineBytes); +std::string printAddress(Addr addr, int cacheLineBits); } // namespace ruby } // namespace gem5 diff --git a/src/mem/ruby/common/DataBlock.cc b/src/mem/ruby/common/DataBlock.cc index 8f47d0026b..bbc0fd21c8 100644 --- a/src/mem/ruby/common/DataBlock.cc +++ b/src/mem/ruby/common/DataBlock.cc @@ -40,8 +40,8 @@ #include "mem/ruby/common/DataBlock.hh" +#include "mem/ruby/common/Address.hh" #include "mem/ruby/common/WriteMask.hh" -#include "mem/ruby/system/RubySystem.hh" namespace gem5 { @@ -51,17 +51,22 @@ namespace ruby DataBlock::DataBlock(const DataBlock &cp) { + assert(cp.isAlloc()); + assert(cp.getBlockSize() > 0); + assert(!m_alloc); + uint8_t *block_update; - size_t block_bytes = RubySystem::getBlockSizeBytes(); - m_data = new uint8_t[block_bytes]; - memcpy(m_data, cp.m_data, block_bytes); + m_block_size = 
cp.getBlockSize(); + m_data = new uint8_t[m_block_size]; + memcpy(m_data, cp.m_data, m_block_size); m_alloc = true; + m_block_size = m_block_size; // If this data block is involved in an atomic operation, the effect // of applying the atomic operations on the data block are recorded in // m_atomicLog. If so, we must copy over every entry in the change log for (size_t i = 0; i < cp.m_atomicLog.size(); i++) { - block_update = new uint8_t[block_bytes]; - memcpy(block_update, cp.m_atomicLog[i], block_bytes); + block_update = new uint8_t[m_block_size]; + memcpy(block_update, cp.m_atomicLog[i], m_block_size); m_atomicLog.push_back(block_update); } } @@ -69,21 +74,44 @@ DataBlock::DataBlock(const DataBlock &cp) void DataBlock::alloc() { - m_data = new uint8_t[RubySystem::getBlockSizeBytes()]; + assert(!m_alloc); + + if (!m_block_size) { + return; + } + + m_data = new uint8_t[m_block_size]; m_alloc = true; clear(); } +void +DataBlock::realloc(int blk_size) +{ + m_block_size = blk_size; + assert(m_block_size > 0); + + if (m_alloc) { + delete [] m_data; + m_alloc = false; + } + alloc(); +} + void DataBlock::clear() { - memset(m_data, 0, RubySystem::getBlockSizeBytes()); + assert(m_alloc); + assert(m_block_size > 0); + memset(m_data, 0, m_block_size); } bool DataBlock::equal(const DataBlock& obj) const { - size_t block_bytes = RubySystem::getBlockSizeBytes(); + assert(m_alloc); + assert(m_block_size > 0); + size_t block_bytes = m_block_size; // Check that the block contents match if (memcmp(m_data, obj.m_data, block_bytes)) { return false; @@ -102,7 +130,9 @@ DataBlock::equal(const DataBlock& obj) const void DataBlock::copyPartial(const DataBlock &dblk, const WriteMask &mask) { - for (int i = 0; i < RubySystem::getBlockSizeBytes(); i++) { + assert(m_alloc); + assert(m_block_size > 0); + for (int i = 0; i < m_block_size; i++) { if (mask.getMask(i, 1)) { m_data[i] = dblk.m_data[i]; } @@ -113,7 +143,9 @@ void DataBlock::atomicPartial(const DataBlock &dblk, const WriteMask &mask, 
bool isAtomicNoReturn) { - for (int i = 0; i < RubySystem::getBlockSizeBytes(); i++) { + assert(m_alloc); + assert(m_block_size > 0); + for (int i = 0; i < m_block_size; i++) { m_data[i] = dblk.m_data[i]; } mask.performAtomic(m_data, m_atomicLog, isAtomicNoReturn); @@ -122,7 +154,9 @@ DataBlock::atomicPartial(const DataBlock &dblk, const WriteMask &mask, void DataBlock::print(std::ostream& out) const { - int size = RubySystem::getBlockSizeBytes(); + assert(m_alloc); + assert(m_block_size > 0); + int size = m_block_size; out << "[ "; for (int i = 0; i < size; i++) { out << std::setw(2) << std::setfill('0') << std::hex @@ -147,6 +181,7 @@ DataBlock::popAtomicLogEntryFront() void DataBlock::clearAtomicLogEntries() { + assert(m_alloc); for (auto log : m_atomicLog) { delete [] log; } @@ -156,35 +191,59 @@ DataBlock::clearAtomicLogEntries() const uint8_t* DataBlock::getData(int offset, int len) const { - assert(offset + len <= RubySystem::getBlockSizeBytes()); + assert(m_alloc); + assert(m_block_size > 0); + assert(offset + len <= m_block_size); return &m_data[offset]; } uint8_t* DataBlock::getDataMod(int offset) { + assert(m_alloc); return &m_data[offset]; } void DataBlock::setData(const uint8_t *data, int offset, int len) { + assert(m_alloc); memcpy(&m_data[offset], data, len); } void DataBlock::setData(PacketPtr pkt) { - int offset = getOffset(pkt->getAddr()); - assert(offset + pkt->getSize() <= RubySystem::getBlockSizeBytes()); + assert(m_alloc); + assert(m_block_size > 0); + int offset = getOffset(pkt->getAddr(), floorLog2(m_block_size)); + assert(offset + pkt->getSize() <= m_block_size); pkt->writeData(&m_data[offset]); } DataBlock & DataBlock::operator=(const DataBlock & obj) { + // Reallocate if needed + if (m_alloc && m_block_size != obj.getBlockSize()) { + delete [] m_data; + m_block_size = obj.getBlockSize(); + alloc(); + } else if (!m_alloc) { + m_block_size = obj.getBlockSize(); + alloc(); + + // Assume this will be realloc'd later if zero. 
+ if (m_block_size == 0) { + return *this; + } + } else { + assert(m_alloc && m_block_size == obj.getBlockSize()); + } + assert(m_block_size > 0); + uint8_t *block_update; - size_t block_bytes = RubySystem::getBlockSizeBytes(); + size_t block_bytes = m_block_size; // Copy entire block contents from obj to current block memcpy(m_data, obj.m_data, block_bytes); // If this data block is involved in an atomic operation, the effect diff --git a/src/mem/ruby/common/DataBlock.hh b/src/mem/ruby/common/DataBlock.hh index 7456a25f3f..ebfa7d1383 100644 --- a/src/mem/ruby/common/DataBlock.hh +++ b/src/mem/ruby/common/DataBlock.hh @@ -61,8 +61,14 @@ class WriteMask; class DataBlock { public: - DataBlock() + // Ideally this should not be called. We allow default so that protocols + do not need to be changed. + DataBlock() = default; + + DataBlock(int blk_size) { + assert(!m_alloc); + m_block_size = blk_size; alloc(); } @@ -101,10 +107,16 @@ class DataBlock bool equal(const DataBlock& obj) const; void print(std::ostream& out) const; + int getBlockSize() const { return m_block_size; } + void setBlockSize(int block_size) { realloc(block_size); } + bool isAlloc() const { return m_alloc; } + void realloc(int blk_size); + private: void alloc(); - uint8_t *m_data; - bool m_alloc; + uint8_t *m_data = nullptr; + bool m_alloc = false; + int m_block_size = 0; // Tracks block changes when atomic ops are applied std::deque m_atomicLog; @@ -124,18 +136,21 @@ DataBlock::assign(uint8_t *data) inline uint8_t DataBlock::getByte(int whichByte) const { + assert(m_alloc); return m_data[whichByte]; } inline void DataBlock::setByte(int whichByte, uint8_t data) { + assert(m_alloc); m_data[whichByte] = data; } inline void DataBlock::copyPartial(const DataBlock & dblk, int offset, int len) { + assert(m_alloc); setData(&dblk.m_data[offset], offset, len); } diff --git a/src/mem/ruby/common/NetDest.cc b/src/mem/ruby/common/NetDest.cc index ba64f2febd..944315b97f 100644 ---
a/src/mem/ruby/common/NetDest.cc +++ b/src/mem/ruby/common/NetDest.cc @@ -30,6 +30,8 @@ #include +#include "mem/ruby/system/RubySystem.hh" + namespace gem5 { @@ -38,12 +40,18 @@ namespace ruby NetDest::NetDest() { - resize(); +} + +NetDest::NetDest(RubySystem *ruby_system) + : m_ruby_system(ruby_system) +{ + resize(); } void NetDest::add(MachineID newElement) { + assert(m_bits.size() > 0); assert(bitIndex(newElement.num) < m_bits[vecIndex(newElement)].getSize()); m_bits[vecIndex(newElement)].add(bitIndex(newElement.num)); } @@ -51,6 +59,7 @@ NetDest::add(MachineID newElement) void NetDest::addNetDest(const NetDest& netDest) { + assert(m_bits.size() > 0); assert(m_bits.size() == netDest.getSize()); for (int i = 0; i < m_bits.size(); i++) { m_bits[i].addSet(netDest.m_bits[i]); @@ -60,6 +69,8 @@ NetDest::addNetDest(const NetDest& netDest) void NetDest::setNetDest(MachineType machine, const Set& set) { + assert(m_ruby_system != nullptr); + // assure that there is only one set of destinations for this machine assert(MachineType_base_level((MachineType)(machine + 1)) - MachineType_base_level(machine) == 1); @@ -69,12 +80,14 @@ NetDest::setNetDest(MachineType machine, const Set& set) void NetDest::remove(MachineID oldElement) { + assert(m_bits.size() > 0); m_bits[vecIndex(oldElement)].remove(bitIndex(oldElement.num)); } void NetDest::removeNetDest(const NetDest& netDest) { + assert(m_bits.size() > 0); assert(m_bits.size() == netDest.getSize()); for (int i = 0; i < m_bits.size(); i++) { m_bits[i].removeSet(netDest.m_bits[i]); @@ -84,6 +97,7 @@ NetDest::removeNetDest(const NetDest& netDest) void NetDest::clear() { + assert(m_bits.size() > 0); for (int i = 0; i < m_bits.size(); i++) { m_bits[i].clear(); } @@ -101,6 +115,8 @@ NetDest::broadcast() void NetDest::broadcast(MachineType machineType) { + assert(m_ruby_system != nullptr); + for (NodeID i = 0; i < MachineType_base_count(machineType); i++) { MachineID mach = {machineType, i}; add(mach); @@ -111,6 +127,9 @@ 
NetDest::broadcast(MachineType machineType) std::vector NetDest::getAllDest() { + assert(m_ruby_system != nullptr); + assert(m_bits.size() > 0); + std::vector dest; dest.clear(); for (int i = 0; i < m_bits.size(); i++) { @@ -127,6 +146,8 @@ NetDest::getAllDest() int NetDest::count() const { + assert(m_bits.size() > 0); + int counter = 0; for (int i = 0; i < m_bits.size(); i++) { counter += m_bits[i].count(); @@ -137,12 +158,14 @@ NetDest::count() const NodeID NetDest::elementAt(MachineID index) { + assert(m_bits.size() > 0); return m_bits[vecIndex(index)].elementAt(bitIndex(index.num)); } MachineID NetDest::smallestElement() const { + assert(m_bits.size() > 0); assert(count() > 0); for (int i = 0; i < m_bits.size(); i++) { for (NodeID j = 0; j < m_bits[i].getSize(); j++) { @@ -158,6 +181,9 @@ NetDest::smallestElement() const MachineID NetDest::smallestElement(MachineType machine) const { + assert(m_bits.size() > 0); + assert(m_ruby_system != nullptr); + int size = m_bits[MachineType_base_level(machine)].getSize(); for (NodeID j = 0; j < size; j++) { if (m_bits[MachineType_base_level(machine)].isElement(j)) { @@ -173,6 +199,7 @@ NetDest::smallestElement(MachineType machine) const bool NetDest::isBroadcast() const { + assert(m_bits.size() > 0); for (int i = 0; i < m_bits.size(); i++) { if (!m_bits[i].isBroadcast()) { return false; @@ -185,6 +212,7 @@ NetDest::isBroadcast() const bool NetDest::isEmpty() const { + assert(m_bits.size() > 0); for (int i = 0; i < m_bits.size(); i++) { if (!m_bits[i].isEmpty()) { return false; @@ -197,8 +225,9 @@ NetDest::isEmpty() const NetDest NetDest::OR(const NetDest& orNetDest) const { + assert(m_bits.size() > 0); assert(m_bits.size() == orNetDest.getSize()); - NetDest result; + NetDest result(m_ruby_system); for (int i = 0; i < m_bits.size(); i++) { result.m_bits[i] = m_bits[i].OR(orNetDest.m_bits[i]); } @@ -209,8 +238,9 @@ NetDest::OR(const NetDest& orNetDest) const NetDest NetDest::AND(const NetDest& andNetDest) const { + 
assert(m_bits.size() > 0); assert(m_bits.size() == andNetDest.getSize()); - NetDest result; + NetDest result(m_ruby_system); for (int i = 0; i < m_bits.size(); i++) { result.m_bits[i] = m_bits[i].AND(andNetDest.m_bits[i]); } @@ -221,6 +251,7 @@ NetDest::AND(const NetDest& andNetDest) const bool NetDest::intersectionIsNotEmpty(const NetDest& other_netDest) const { + assert(m_bits.size() > 0); assert(m_bits.size() == other_netDest.getSize()); for (int i = 0; i < m_bits.size(); i++) { if (!m_bits[i].intersectionIsEmpty(other_netDest.m_bits[i])) { @@ -233,6 +264,7 @@ NetDest::intersectionIsNotEmpty(const NetDest& other_netDest) const bool NetDest::isSuperset(const NetDest& test) const { + assert(m_bits.size() > 0); assert(m_bits.size() == test.getSize()); for (int i = 0; i < m_bits.size(); i++) { @@ -246,12 +278,15 @@ NetDest::isSuperset(const NetDest& test) const bool NetDest::isElement(MachineID element) const { + assert(m_bits.size() > 0); return ((m_bits[vecIndex(element)])).isElement(bitIndex(element.num)); } void NetDest::resize() { + assert(m_ruby_system != nullptr); + m_bits.resize(MachineType_base_level(MachineType_NUM)); assert(m_bits.size() == MachineType_NUM); @@ -263,6 +298,7 @@ NetDest::resize() void NetDest::print(std::ostream& out) const { + assert(m_bits.size() > 0); out << "[NetDest (" << m_bits.size() << ") "; for (int i = 0; i < m_bits.size(); i++) { @@ -277,6 +313,7 @@ NetDest::print(std::ostream& out) const bool NetDest::isEqual(const NetDest& n) const { + assert(m_bits.size() > 0); assert(m_bits.size() == n.m_bits.size()); for (unsigned int i = 0; i < m_bits.size(); ++i) { if (!m_bits[i].isEqual(n.m_bits[i])) @@ -285,5 +322,19 @@ NetDest::isEqual(const NetDest& n) const return true; } +int +NetDest::MachineType_base_count(const MachineType& obj) +{ + assert(m_ruby_system != nullptr); + return m_ruby_system->MachineType_base_count(obj); +} + +int +NetDest::MachineType_base_number(const MachineType& obj) +{ + assert(m_ruby_system != nullptr); + 
return m_ruby_system->MachineType_base_number(obj); +} + } // namespace ruby } // namespace gem5 diff --git a/src/mem/ruby/common/NetDest.hh b/src/mem/ruby/common/NetDest.hh index e71b876754..83f340a478 100644 --- a/src/mem/ruby/common/NetDest.hh +++ b/src/mem/ruby/common/NetDest.hh @@ -41,6 +41,8 @@ namespace gem5 namespace ruby { +class RubySystem; + // NetDest specifies the network destination of a Message class NetDest { @@ -48,6 +50,7 @@ class NetDest // Constructors // creates and empty set NetDest(); + NetDest(RubySystem *ruby_system); explicit NetDest(int bit_size); NetDest& operator=(const Set& obj); @@ -98,6 +101,8 @@ class NetDest void print(std::ostream& out) const; + void setRubySystem(RubySystem *rs) { m_ruby_system = rs; resize(); } + private: // returns a value >= MachineType_base_level("this machine") // and < MachineType_base_level("next highest machine") @@ -112,6 +117,12 @@ class NetDest NodeID bitIndex(NodeID index) const { return index; } std::vector m_bits; // a vector of bit vectors - i.e. 
Sets + + // Needed to call MachineType_base_count/number + RubySystem *m_ruby_system = nullptr; + + int MachineType_base_count(const MachineType& obj); + int MachineType_base_number(const MachineType& obj); }; inline std::ostream& diff --git a/src/mem/ruby/common/SubBlock.cc b/src/mem/ruby/common/SubBlock.cc index 92cfd8b633..be0adc1233 100644 --- a/src/mem/ruby/common/SubBlock.cc +++ b/src/mem/ruby/common/SubBlock.cc @@ -38,13 +38,14 @@ namespace ruby using stl_helpers::operator<<; -SubBlock::SubBlock(Addr addr, int size) +SubBlock::SubBlock(Addr addr, int size, int cl_bits) { m_address = addr; resize(size); for (int i = 0; i < size; i++) { setByte(i, 0); } + m_cache_line_bits = cl_bits; } void @@ -52,7 +53,7 @@ SubBlock::internalMergeFrom(const DataBlock& data) { int size = getSize(); assert(size > 0); - int offset = getOffset(m_address); + int offset = getOffset(m_address, m_cache_line_bits); for (int i = 0; i < size; i++) { this->setByte(i, data.getByte(offset + i)); } @@ -63,7 +64,7 @@ SubBlock::internalMergeTo(DataBlock& data) const { int size = getSize(); assert(size > 0); - int offset = getOffset(m_address); + int offset = getOffset(m_address, m_cache_line_bits); for (int i = 0; i < size; i++) { // This will detect crossing a cache line boundary data.setByte(offset + i, this->getByte(i)); diff --git a/src/mem/ruby/common/SubBlock.hh b/src/mem/ruby/common/SubBlock.hh index e1a83600c2..3790bbac58 100644 --- a/src/mem/ruby/common/SubBlock.hh +++ b/src/mem/ruby/common/SubBlock.hh @@ -45,7 +45,7 @@ class SubBlock { public: SubBlock() { } - SubBlock(Addr addr, int size); + SubBlock(Addr addr, int size, int cl_bits); ~SubBlock() { } Addr getAddress() const { return m_address; } @@ -74,6 +74,7 @@ class SubBlock // Data Members (m_ prefix) Addr m_address; std::vector m_data; + int m_cache_line_bits; }; inline std::ostream& diff --git a/src/mem/ruby/common/WriteMask.cc b/src/mem/ruby/common/WriteMask.cc index 1fa03c951e..f176aec9fc 100644 ---
a/src/mem/ruby/common/WriteMask.cc +++ b/src/mem/ruby/common/WriteMask.cc @@ -39,13 +39,13 @@ namespace ruby { WriteMask::WriteMask() - : mSize(RubySystem::getBlockSizeBytes()), mMask(mSize, false), - mAtomic(false) + : mSize(0), mMask(mSize, false), mAtomic(false) {} void WriteMask::print(std::ostream& out) const { + assert(mSize > 0); std::string str(mSize,'0'); for (int i = 0; i < mSize; i++) { str[i] = mMask[i] ? ('1') : ('0'); @@ -59,6 +59,7 @@ void WriteMask::performAtomic(uint8_t * p, std::deque& log, bool isAtomicNoReturn) const { + assert(mSize > 0); int offset; uint8_t *block_update; // Here, operations occur in FIFO order from the mAtomicOp diff --git a/src/mem/ruby/common/WriteMask.hh b/src/mem/ruby/common/WriteMask.hh index 8c6b8ce976..e620997cd8 100644 --- a/src/mem/ruby/common/WriteMask.hh +++ b/src/mem/ruby/common/WriteMask.hh @@ -78,6 +78,17 @@ class WriteMask ~WriteMask() {} + int getBlockSize() const { return mSize; } + void + setBlockSize(int size) + { + // This should only be used once if the default ctor was used. Probably + // by src/mem/ruby/protocol/RubySlicc_MemControl.sm. 
+ assert(mSize == 0); + assert(size > 0); + mSize = size; + } + void clear() { @@ -87,6 +98,7 @@ class WriteMask bool test(int offset) const { + assert(mSize > 0); assert(offset < mSize); return mMask[offset]; } @@ -94,6 +106,7 @@ class WriteMask void setMask(int offset, int len, bool val = true) { + assert(mSize > 0); assert(mSize >= (offset + len)); for (int i = 0; i < len; i++) { mMask[offset + i] = val; @@ -102,6 +115,7 @@ class WriteMask void fillMask() { + assert(mSize > 0); for (int i = 0; i < mSize; i++) { mMask[i] = true; } @@ -111,6 +125,7 @@ class WriteMask getMask(int offset, int len) const { bool tmp = true; + assert(mSize > 0); assert(mSize >= (offset + len)); for (int i = 0; i < len; i++) { tmp = tmp & mMask.at(offset + i); @@ -122,6 +137,7 @@ class WriteMask isOverlap(const WriteMask &readMask) const { bool tmp = false; + assert(mSize > 0); assert(mSize == readMask.mSize); for (int i = 0; i < mSize; i++) { if (readMask.mMask.at(i)) { @@ -135,6 +151,7 @@ class WriteMask containsMask(const WriteMask &readMask) const { bool tmp = true; + assert(mSize > 0); assert(mSize == readMask.mSize); for (int i = 0; i < mSize; i++) { if (readMask.mMask.at(i)) { @@ -146,6 +163,7 @@ class WriteMask bool isEmpty() const { + assert(mSize > 0); for (int i = 0; i < mSize; i++) { if (mMask.at(i)) { return false; @@ -157,6 +175,7 @@ class WriteMask bool isFull() const { + assert(mSize > 0); for (int i = 0; i < mSize; i++) { if (!mMask.at(i)) { return false; @@ -168,6 +187,7 @@ class WriteMask void andMask(const WriteMask & writeMask) { + assert(mSize > 0); assert(mSize == writeMask.mSize); for (int i = 0; i < mSize; i++) { mMask[i] = (mMask.at(i)) && (writeMask.mMask.at(i)); @@ -182,6 +202,7 @@ class WriteMask void orMask(const WriteMask & writeMask) { + assert(mSize > 0); assert(mSize == writeMask.mSize); for (int i = 0; i < mSize; i++) { mMask[i] = (mMask.at(i)) || (writeMask.mMask.at(i)); @@ -196,6 +217,7 @@ class WriteMask void setInvertedMask(const WriteMask & 
writeMask) { + assert(mSize > 0); assert(mSize == writeMask.mSize); for (int i = 0; i < mSize; i++) { mMask[i] = !writeMask.mMask.at(i); @@ -205,6 +227,7 @@ class WriteMask int firstBitSet(bool val, int offset = 0) const { + assert(mSize > 0); for (int i = offset; i < mSize; ++i) if (mMask[i] == val) return i; @@ -214,6 +237,7 @@ class WriteMask int count(int offset = 0) const { + assert(mSize > 0); int count = 0; for (int i = offset; i < mSize; ++i) count += mMask[i]; diff --git a/src/mem/ruby/network/MessageBuffer.cc b/src/mem/ruby/network/MessageBuffer.cc index 9a4439a538..8b3a724469 100644 --- a/src/mem/ruby/network/MessageBuffer.cc +++ b/src/mem/ruby/network/MessageBuffer.cc @@ -47,7 +47,6 @@ #include "base/random.hh" #include "base/stl_helpers.hh" #include "debug/RubyQueue.hh" -#include "mem/ruby/system/RubySystem.hh" namespace gem5 { @@ -216,6 +215,7 @@ random_time() void MessageBuffer::enqueue(MsgPtr message, Tick current_time, Tick delta, + bool ruby_is_random, bool ruby_warmup, bool bypassStrictFIFO) { // record current time incase we have a pop that also adjusts my size @@ -237,7 +237,7 @@ MessageBuffer::enqueue(MsgPtr message, Tick current_time, Tick delta, // is turned on and this buffer allows it if ((m_randomization == MessageRandomization::disabled) || ((m_randomization == MessageRandomization::ruby_system) && - !RubySystem::getRandomization())) { + !ruby_is_random)) { // No randomization arrival_time = current_time + delta; } else { @@ -265,7 +265,7 @@ MessageBuffer::enqueue(MsgPtr message, Tick current_time, Tick delta, } // If running a cache trace, don't worry about the last arrival checks - if (!RubySystem::getWarmupEnabled()) { + if (!ruby_warmup) { m_last_arrival_time = arrival_time; } @@ -447,7 +447,6 @@ MessageBuffer::stallMessage(Addr addr, Tick current_time) { DPRINTF(RubyQueue, "Stalling due to %#x\n", addr); assert(isReady(current_time)); - assert(getOffset(addr) == 0); MsgPtr message = m_prio_heap.front(); // Since the message will 
just be moved to stall map, indicate that the @@ -479,7 +478,8 @@ MessageBuffer::deferEnqueueingMessage(Addr addr, MsgPtr message) } void -MessageBuffer::enqueueDeferredMessages(Addr addr, Tick curTime, Tick delay) +MessageBuffer::enqueueDeferredMessages(Addr addr, Tick curTime, Tick delay, + bool ruby_is_random, bool ruby_warmup) { assert(!isDeferredMsgMapEmpty(addr)); std::vector& msg_vec = m_deferred_msg_map[addr]; @@ -487,7 +487,7 @@ MessageBuffer::enqueueDeferredMessages(Addr addr, Tick curTime, Tick delay) // enqueue all deferred messages associated with this address for (MsgPtr m : msg_vec) { - enqueue(m, curTime, delay); + enqueue(m, curTime, delay, ruby_is_random, ruby_warmup); } msg_vec.clear(); diff --git a/src/mem/ruby/network/MessageBuffer.hh b/src/mem/ruby/network/MessageBuffer.hh index 03a0454433..b45e531d11 100644 --- a/src/mem/ruby/network/MessageBuffer.hh +++ b/src/mem/ruby/network/MessageBuffer.hh @@ -90,13 +90,14 @@ class MessageBuffer : public SimObject Tick readyTime() const; void - delayHead(Tick current_time, Tick delta) + delayHead(Tick current_time, Tick delta, bool ruby_is_random, + bool ruby_warmup) { MsgPtr m = m_prio_heap.front(); std::pop_heap(m_prio_heap.begin(), m_prio_heap.end(), std::greater()); m_prio_heap.pop_back(); - enqueue(m, current_time, delta); + enqueue(m, current_time, delta, ruby_is_random, ruby_warmup); } bool areNSlotsAvailable(unsigned int n, Tick curTime); @@ -124,6 +125,7 @@ class MessageBuffer : public SimObject const MsgPtr &peekMsgPtr() const { return m_prio_heap.front(); } void enqueue(MsgPtr message, Tick curTime, Tick delta, + bool ruby_is_random, bool ruby_warmup, bool bypassStrictFIFO = false); // Defer enqueueing a message to a later cycle by putting it aside and not @@ -135,7 +137,8 @@ class MessageBuffer : public SimObject // enqueue all previously deferred messages that are associated with the // input address - void enqueueDeferredMessages(Addr addr, Tick curTime, Tick delay); + void 
enqueueDeferredMessages(Addr addr, Tick curTime, Tick delay, + bool ruby_is_random, bool ruby_warmup); bool isDeferredMsgMapEmpty(Addr addr) const; //! Updates the delay cycles of the message at the head of the queue, diff --git a/src/mem/ruby/network/Network.cc b/src/mem/ruby/network/Network.cc index 757ed9498e..480b5bcef0 100644 --- a/src/mem/ruby/network/Network.cc +++ b/src/mem/ruby/network/Network.cc @@ -65,7 +65,8 @@ Network::Network(const Params &p) "%s: data message size > cache line size", name()); m_data_msg_size = p.data_msg_size + m_control_msg_size; - params().ruby_system->registerNetwork(this); + m_ruby_system = p.ruby_system; + m_ruby_system->registerNetwork(this); // Populate localNodeVersions with the version of each MachineType in // this network. This will be used to compute a global to local ID. @@ -102,7 +103,8 @@ Network::Network(const Params &p) m_topology_ptr = new Topology(m_nodes, p.routers.size(), m_virtual_networks, - p.ext_links, p.int_links); + p.ext_links, p.int_links, + m_ruby_system); // Allocate to and from queues // Queues that are getting messages from protocol @@ -246,7 +248,7 @@ Network::addressToNodeID(Addr addr, MachineType mtype) } } } - return MachineType_base_count(mtype); + return m_ruby_system->MachineType_base_count(mtype); } NodeID @@ -256,5 +258,23 @@ Network::getLocalNodeID(NodeID global_id) const return globalToLocalMap.at(global_id); } +bool +Network::getRandomization() const +{ + return m_ruby_system->getRandomization(); +} + +bool +Network::getWarmupEnabled() const +{ + return m_ruby_system->getWarmupEnabled(); +} + +int +Network::MachineType_base_number(const MachineType& obj) +{ + return m_ruby_system->MachineType_base_number(obj); +} + } // namespace ruby } // namespace gem5 diff --git a/src/mem/ruby/network/Network.hh b/src/mem/ruby/network/Network.hh index 8ca68a0279..c0d21af240 100644 --- a/src/mem/ruby/network/Network.hh +++ b/src/mem/ruby/network/Network.hh @@ -78,6 +78,7 @@ namespace ruby class NetDest; 
class MessageBuffer; +class RubySystem; class Network : public ClockedObject { @@ -147,6 +148,10 @@ class Network : public ClockedObject NodeID getLocalNodeID(NodeID global_id) const; + bool getRandomization() const; + bool getWarmupEnabled() const; + RubySystem *getRubySystem() const { return m_ruby_system; } + protected: // Private copy constructor and assignment operator Network(const Network& obj); @@ -176,6 +181,12 @@ class Network : public ClockedObject // Global NodeID to local node map. If there are not multiple networks in // the same RubySystem, this is a one-to-one mapping of global to local. std::unordered_map globalToLocalMap; + + // For accessing whether randomization/warmup are turned on. We cannot store + those values in the constructor in case we are constructed first. + RubySystem *m_ruby_system = nullptr; + + int MachineType_base_number(const MachineType& obj); }; inline std::ostream& diff --git a/src/mem/ruby/network/Topology.cc b/src/mem/ruby/network/Topology.cc index 39444c9023..b2cd7897f8 100644 --- a/src/mem/ruby/network/Topology.cc +++ b/src/mem/ruby/network/Topology.cc @@ -37,6 +37,7 @@ #include "mem/ruby/network/BasicLink.hh" #include "mem/ruby/network/Network.hh" #include "mem/ruby/slicc_interface/AbstractController.hh" +#include "mem/ruby/system/RubySystem.hh" namespace gem5 { @@ -56,10 +57,12 @@ const int INFINITE_LATENCY = 10000; // Yes, this is a big hack Topology::Topology(uint32_t num_nodes, uint32_t num_routers, uint32_t num_vnets, const std::vector &ext_links, - const std::vector &int_links) - : m_nodes(MachineType_base_number(MachineType_NUM)), + const std::vector &int_links, + RubySystem *ruby_system) + : m_nodes(ruby_system->MachineType_base_number(MachineType_NUM)), m_number_of_switches(num_routers), m_vnets(num_vnets), - m_ext_link_vector(ext_links), m_int_link_vector(int_links) + m_ext_link_vector(ext_links), m_int_link_vector(int_links), + m_ruby_system(ruby_system) { // Total nodes/controllers in network assert(m_nodes >
1); @@ -78,7 +81,8 @@ Topology::Topology(uint32_t num_nodes, uint32_t num_routers, AbstractController *abs_cntrl = ext_link->params().ext_node; BasicRouter *router = ext_link->params().int_node; - int machine_base_idx = MachineType_base_number(abs_cntrl->getType()); + int machine_base_idx = + ruby_system->MachineType_base_number(abs_cntrl->getType()); int ext_idx1 = machine_base_idx + abs_cntrl->getVersion(); int ext_idx2 = ext_idx1 + m_nodes; int int_idx = router->params().router_id + 2*m_nodes; @@ -189,7 +193,7 @@ Topology::createLinks(Network *net) for (int i = 0; i < topology_weights[0].size(); i++) { for (int j = 0; j < topology_weights[0][i].size(); j++) { std::vector routingMap; - routingMap.resize(m_vnets); + routingMap.resize(m_vnets, m_ruby_system); // Not all sources and destinations are connected // by direct links. We only construct the links @@ -264,7 +268,7 @@ Topology::makeLink(Network *net, SwitchID src, SwitchID dest, for (int l = 0; l < links.size(); l++) { link_entry = links[l]; std::vector linkRoute; - linkRoute.resize(m_vnets); + linkRoute.resize(m_vnets, m_ruby_system); BasicLink *link = link_entry.link; if (link->mVnets.size() == 0) { net->makeExtInLink(src, dest - (2 * m_nodes), link, @@ -287,7 +291,7 @@ Topology::makeLink(Network *net, SwitchID src, SwitchID dest, for (int l = 0; l < links.size(); l++) { link_entry = links[l]; std::vector linkRoute; - linkRoute.resize(m_vnets); + linkRoute.resize(m_vnets, m_ruby_system); BasicLink *link = link_entry.link; if (link->mVnets.size() == 0) { net->makeExtOutLink(src - (2 * m_nodes), node, link, @@ -309,7 +313,7 @@ Topology::makeLink(Network *net, SwitchID src, SwitchID dest, for (int l = 0; l < links.size(); l++) { link_entry = links[l]; std::vector linkRoute; - linkRoute.resize(m_vnets); + linkRoute.resize(m_vnets, m_ruby_system); BasicLink *link = link_entry.link; if (link->mVnets.size() == 0) { net->makeInternalLink(src - (2 * m_nodes), @@ -413,16 +417,17 @@ 
Topology::shortest_path_to_node(SwitchID src, SwitchID next, const Matrix &weights, const Matrix &dist, int vnet) { - NetDest result; + NetDest result(m_ruby_system); int d = 0; int machines; int max_machines; machines = MachineType_NUM; - max_machines = MachineType_base_number(MachineType_NUM); + max_machines = m_ruby_system->MachineType_base_number(MachineType_NUM); for (int m = 0; m < machines; m++) { - for (NodeID i = 0; i < MachineType_base_count((MachineType)m); i++) { + for (NodeID i = 0; + i < m_ruby_system->MachineType_base_count((MachineType)m); i++) { // we use "d+max_machines" below since the "destination" // switches for the machines are numbered // [MachineType_base_number(MachineType_NUM)... diff --git a/src/mem/ruby/network/Topology.hh b/src/mem/ruby/network/Topology.hh index 301811e6ab..7ab395762a 100644 --- a/src/mem/ruby/network/Topology.hh +++ b/src/mem/ruby/network/Topology.hh @@ -80,7 +80,8 @@ class Topology public: Topology(uint32_t num_nodes, uint32_t num_routers, uint32_t num_vnets, const std::vector &ext_links, - const std::vector &int_links); + const std::vector &int_links, + RubySystem *ruby_system); uint32_t numSwitches() const { return m_number_of_switches; } void createLinks(Network *net); @@ -108,7 +109,7 @@ class Topology const Matrix &weights, const Matrix &dist, int vnet); - const uint32_t m_nodes; + uint32_t m_nodes; const uint32_t m_number_of_switches; int m_vnets; @@ -116,6 +117,8 @@ class Topology std::vector m_int_link_vector; LinkMap m_link_map; + + RubySystem *m_ruby_system = nullptr; }; inline std::ostream& diff --git a/src/mem/ruby/network/garnet/NetworkInterface.cc b/src/mem/ruby/network/garnet/NetworkInterface.cc index 31d625c4d5..8564baca6d 100644 --- a/src/mem/ruby/network/garnet/NetworkInterface.cc +++ b/src/mem/ruby/network/garnet/NetworkInterface.cc @@ -41,6 +41,7 @@ #include "mem/ruby/network/garnet/Credit.hh" #include "mem/ruby/network/garnet/flitBuffer.hh" #include "mem/ruby/slicc_interface/Message.hh" +#include 
"mem/ruby/system/RubySystem.hh" namespace gem5 { @@ -244,7 +245,9 @@ NetworkInterface::wakeup() outNode_ptr[vnet]->areNSlotsAvailable(1, curTime)) { // Space is available. Enqueue to protocol buffer. outNode_ptr[vnet]->enqueue(t_flit->get_msg_ptr(), curTime, - cyclesToTicks(Cycles(1))); + cyclesToTicks(Cycles(1)), + m_net_ptr->getRandomization(), + m_net_ptr->getWarmupEnabled()); // Simply send a credit back since we are not buffering // this flit in the NI @@ -332,7 +335,9 @@ NetworkInterface::checkStallQueue() if (outNode_ptr[vnet]->areNSlotsAvailable(1, curTime)) { outNode_ptr[vnet]->enqueue(stallFlit->get_msg_ptr(), - curTime, cyclesToTicks(Cycles(1))); + curTime, cyclesToTicks(Cycles(1)), + m_net_ptr->getRandomization(), + m_net_ptr->getWarmupEnabled()); // Send back a credit with free signal now that the // VC is no longer stalled. @@ -699,6 +704,12 @@ NetworkInterface::functionalWrite(Packet *pkt) return num_functional_writes; } +int +NetworkInterface::MachineType_base_number(const MachineType& obj) +{ + return m_net_ptr->getRubySystem()->MachineType_base_number(obj); +} + } // namespace garnet } // namespace ruby } // namespace gem5 diff --git a/src/mem/ruby/network/garnet/NetworkInterface.hh b/src/mem/ruby/network/garnet/NetworkInterface.hh index d42db5ee2a..cd7bb3b171 100644 --- a/src/mem/ruby/network/garnet/NetworkInterface.hh +++ b/src/mem/ruby/network/garnet/NetworkInterface.hh @@ -306,6 +306,8 @@ class NetworkInterface : public ClockedObject, public Consumer InputPort *getInportForVnet(int vnet); OutputPort *getOutportForVnet(int vnet); + + int MachineType_base_number(const MachineType& obj); }; } // namespace garnet diff --git a/src/mem/ruby/network/simple/PerfectSwitch.cc b/src/mem/ruby/network/simple/PerfectSwitch.cc index 74d78e3aae..20d57f04be 100644 --- a/src/mem/ruby/network/simple/PerfectSwitch.cc +++ b/src/mem/ruby/network/simple/PerfectSwitch.cc @@ -268,7 +268,8 @@ PerfectSwitch::operateMessageBuffer(MessageBuffer *buffer, int vnet) 
buffer->getIncomingLink(), vnet, outgoing, vnet); out_port.buffers[vnet]->enqueue(msg_ptr, current_time, - out_port.latency); + out_port.latency, m_switch->getNetPtr()->getRandomization(), + m_switch->getNetPtr()->getWarmupEnabled()); } } } diff --git a/src/mem/ruby/network/simple/Switch.hh b/src/mem/ruby/network/simple/Switch.hh index 86abfda871..e6e22022bc 100644 --- a/src/mem/ruby/network/simple/Switch.hh +++ b/src/mem/ruby/network/simple/Switch.hh @@ -104,6 +104,7 @@ class Switch : public BasicRouter void print(std::ostream& out) const; void init_net_ptr(SimpleNetwork* net_ptr) { m_network_ptr = net_ptr; } + SimpleNetwork* getNetPtr() const { return m_network_ptr; } bool functionalRead(Packet *); bool functionalRead(Packet *, WriteMask&); diff --git a/src/mem/ruby/network/simple/Throttle.cc b/src/mem/ruby/network/simple/Throttle.cc index 20cebccabb..fc5649330f 100644 --- a/src/mem/ruby/network/simple/Throttle.cc +++ b/src/mem/ruby/network/simple/Throttle.cc @@ -199,7 +199,9 @@ Throttle::operateVnet(int vnet, int channel, int &total_bw_remaining, // Move the message in->dequeue(current_time); out->enqueue(msg_ptr, current_time, - m_switch->cyclesToTicks(m_link_latency)); + m_switch->cyclesToTicks(m_link_latency), + m_ruby_system->getRandomization(), + m_ruby_system->getWarmupEnabled()); // Count the message (*(throttleStats. 
diff --git a/src/mem/ruby/profiler/AddressProfiler.cc b/src/mem/ruby/profiler/AddressProfiler.cc index 05fc486c63..ce40c35a9f 100644 --- a/src/mem/ruby/profiler/AddressProfiler.cc +++ b/src/mem/ruby/profiler/AddressProfiler.cc @@ -34,6 +34,7 @@ #include "base/stl_helpers.hh" #include "mem/ruby/profiler/Profiler.hh" #include "mem/ruby/protocol/RubyRequest.hh" +#include "mem/ruby/system/RubySystem.hh" namespace gem5 { @@ -307,7 +308,8 @@ AddressProfiler::addTraceSample(Addr data_addr, Addr pc_addr, } // record data address trace info - data_addr = makeLineAddress(data_addr); + int block_size_bits = m_profiler->m_ruby_system->getBlockSizeBits(); + data_addr = makeLineAddress(data_addr, block_size_bits); lookupTraceForAddress(data_addr, m_dataAccessTrace). update(type, access_mode, id, sharing_miss); diff --git a/src/mem/ruby/protocol/GPU_VIPER-SQC.sm b/src/mem/ruby/protocol/GPU_VIPER-SQC.sm index ca606a5921..43fb96c375 100644 --- a/src/mem/ruby/protocol/GPU_VIPER-SQC.sm +++ b/src/mem/ruby/protocol/GPU_VIPER-SQC.sm @@ -95,7 +95,7 @@ machine(MachineType:SQC, "GPU SQC (L1 I Cache)") } TBETable TBEs, template="", constructor="m_number_of_TBEs"; - int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()"; + int TCC_select_low_bit, default="m_ruby_system->getBlockSizeBits()"; void set_cache_entry(AbstractCacheEntry b); void unset_cache_entry(); diff --git a/src/mem/ruby/protocol/GPU_VIPER-TCP.sm b/src/mem/ruby/protocol/GPU_VIPER-TCP.sm index 5d98a73041..d1e1ffb7b0 100644 --- a/src/mem/ruby/protocol/GPU_VIPER-TCP.sm +++ b/src/mem/ruby/protocol/GPU_VIPER-TCP.sm @@ -121,7 +121,7 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") } TBETable TBEs, template="", constructor="m_number_of_TBEs"; - int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()"; + int TCC_select_low_bit, default="m_ruby_system->getBlockSizeBits()"; int WTcnt, default="0"; int Fcnt, default="0"; bool inFlush, default="false"; diff --git a/src/mem/ruby/protocol/MESI_Three_Level-L1cache.sm 
b/src/mem/ruby/protocol/MESI_Three_Level-L1cache.sm index bcf99ff362..ed5e40cfa1 100644 --- a/src/mem/ruby/protocol/MESI_Three_Level-L1cache.sm +++ b/src/mem/ruby/protocol/MESI_Three_Level-L1cache.sm @@ -167,7 +167,7 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP") TBETable TBEs, template="", constructor="m_number_of_TBEs"; - int l2_select_low_bit, default="RubySystem::getBlockSizeBits()"; + int l2_select_low_bit, default="m_ruby_system->getBlockSizeBits()"; Tick clockEdge(); Cycles ticksToCycles(Tick t); diff --git a/src/mem/ruby/protocol/MESI_Two_Level-L1cache.sm b/src/mem/ruby/protocol/MESI_Two_Level-L1cache.sm index 2b5935dee5..29f6d8e87d 100644 --- a/src/mem/ruby/protocol/MESI_Two_Level-L1cache.sm +++ b/src/mem/ruby/protocol/MESI_Two_Level-L1cache.sm @@ -167,7 +167,7 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP") TBETable TBEs, template="", constructor="m_number_of_TBEs"; - int l2_select_low_bit, default="RubySystem::getBlockSizeBits()"; + int l2_select_low_bit, default="m_ruby_system->getBlockSizeBits()"; Tick clockEdge(); Cycles ticksToCycles(Tick t); diff --git a/src/mem/ruby/protocol/MOESI_AMD_Base-RegionBuffer.sm b/src/mem/ruby/protocol/MOESI_AMD_Base-RegionBuffer.sm index 5d85ad2fc6..bac7fd1b12 100644 --- a/src/mem/ruby/protocol/MOESI_AMD_Base-RegionBuffer.sm +++ b/src/mem/ruby/protocol/MOESI_AMD_Base-RegionBuffer.sm @@ -181,7 +181,7 @@ machine(MachineType:RegionBuffer, "Region Buffer for AMD_Base-like protocol") // Stores only region addresses TBETable TBEs, template="", constructor="m_number_of_TBEs"; - int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()"; + int TCC_select_low_bit, default="m_ruby_system->getBlockSizeBits()"; Tick clockEdge(); Tick cyclesToTicks(Cycles c); @@ -195,8 +195,8 @@ machine(MachineType:RegionBuffer, "Region Buffer for AMD_Base-like protocol") Cycles curCycle(); MachineID mapAddressToMachine(Addr addr, MachineType mtype); - int blockBits, default="RubySystem::getBlockSizeBits()"; - int 
blockBytes, default="RubySystem::getBlockSizeBytes()"; + int blockBits, default="m_ruby_system->getBlockSizeBits()"; + int blockBytes, default="m_ruby_system->getBlockSizeBytes()"; int regionBits, default="log2(m_blocksPerRegion)"; // Functions diff --git a/src/mem/ruby/protocol/MOESI_AMD_Base-RegionDir.sm b/src/mem/ruby/protocol/MOESI_AMD_Base-RegionDir.sm index 2464e038ff..3f1ba2540f 100644 --- a/src/mem/ruby/protocol/MOESI_AMD_Base-RegionDir.sm +++ b/src/mem/ruby/protocol/MOESI_AMD_Base-RegionDir.sm @@ -155,7 +155,7 @@ machine(MachineType:RegionDir, "Region Directory for AMD_Base-like protocol") // Stores only region addresses TBETable TBEs, template="", constructor="m_number_of_TBEs"; - int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()"; + int TCC_select_low_bit, default="m_ruby_system->getBlockSizeBits()"; Tick clockEdge(); Tick cyclesToTicks(Cycles c); @@ -169,8 +169,8 @@ machine(MachineType:RegionDir, "Region Directory for AMD_Base-like protocol") Cycles curCycle(); MachineID mapAddressToMachine(Addr addr, MachineType mtype); - int blockBits, default="RubySystem::getBlockSizeBits()"; - int blockBytes, default="RubySystem::getBlockSizeBytes()"; + int blockBits, default="m_ruby_system->getBlockSizeBits()"; + int blockBytes, default="m_ruby_system->getBlockSizeBytes()"; int regionBits, default="log2(m_blocksPerRegion)"; // Functions diff --git a/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm b/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm index 17a92f5f90..5b5ab3148a 100644 --- a/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm +++ b/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm @@ -183,7 +183,7 @@ machine(MachineType:Directory, "AMD Baseline protocol") TBETable TBEs, template="", constructor="m_number_of_TBEs"; - int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()"; + int TCC_select_low_bit, default="m_ruby_system->getBlockSizeBits()"; Tick clockEdge(); Tick cyclesToTicks(Cycles c); diff --git a/src/mem/ruby/protocol/MOESI_AMD_Base-probeFilter.sm 
b/src/mem/ruby/protocol/MOESI_AMD_Base-probeFilter.sm index 4e9e9597aa..b53ebe8ee2 100644 --- a/src/mem/ruby/protocol/MOESI_AMD_Base-probeFilter.sm +++ b/src/mem/ruby/protocol/MOESI_AMD_Base-probeFilter.sm @@ -192,7 +192,7 @@ machine(MachineType:Directory, "AMD Baseline protocol") TBETable TBEs, template="", constructor="m_number_of_TBEs"; - int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()"; + int TCC_select_low_bit, default="m_ruby_system->getBlockSizeBits()"; Tick clockEdge(); Tick cyclesToTicks(Cycles c); diff --git a/src/mem/ruby/protocol/MOESI_CMP_directory-dir.sm b/src/mem/ruby/protocol/MOESI_CMP_directory-dir.sm index 4a513d6d3f..b6410d12e7 100644 --- a/src/mem/ruby/protocol/MOESI_CMP_directory-dir.sm +++ b/src/mem/ruby/protocol/MOESI_CMP_directory-dir.sm @@ -143,7 +143,7 @@ machine(MachineType:Directory, "Directory protocol") bool isPresent(Addr); } - int blockSize, default="RubySystem::getBlockSizeBytes()"; + int blockSize, default="m_ruby_system->getBlockSizeBytes()"; // ** OBJECTS ** TBETable TBEs, template="", constructor="m_number_of_TBEs"; diff --git a/src/mem/ruby/protocol/MOESI_CMP_token-L1cache.sm b/src/mem/ruby/protocol/MOESI_CMP_token-L1cache.sm index 865fce4e3c..24f8146a02 100644 --- a/src/mem/ruby/protocol/MOESI_CMP_token-L1cache.sm +++ b/src/mem/ruby/protocol/MOESI_CMP_token-L1cache.sm @@ -198,7 +198,7 @@ machine(MachineType:L1Cache, "Token protocol") TBETable L1_TBEs, template="", constructor="m_number_of_TBEs"; bool starving, default="false"; - int l2_select_low_bit, default="RubySystem::getBlockSizeBits()"; + int l2_select_low_bit, default="m_ruby_system->getBlockSizeBits()"; PersistentTable persistentTable; TimerTable useTimerTable; diff --git a/src/mem/ruby/protocol/MOESI_CMP_token-dir.sm b/src/mem/ruby/protocol/MOESI_CMP_token-dir.sm index 7f2bdf94e0..8d035a61bb 100644 --- a/src/mem/ruby/protocol/MOESI_CMP_token-dir.sm +++ b/src/mem/ruby/protocol/MOESI_CMP_token-dir.sm @@ -171,7 +171,7 @@ machine(MachineType:Directory, 
"Token protocol") TBETable TBEs, template="", constructor="m_number_of_TBEs"; bool starving, default="false"; - int l2_select_low_bit, default="RubySystem::getBlockSizeBits()"; + int l2_select_low_bit, default="m_ruby_system->getBlockSizeBits()"; Tick clockEdge(); Tick clockEdge(Cycles c); diff --git a/src/mem/ruby/protocol/RubySlicc_Exports.sm b/src/mem/ruby/protocol/RubySlicc_Exports.sm index 8f0341f328..97770e3516 100644 --- a/src/mem/ruby/protocol/RubySlicc_Exports.sm +++ b/src/mem/ruby/protocol/RubySlicc_Exports.sm @@ -72,6 +72,8 @@ structure(WriteMask, external="yes", desc="...") { int count(); int count(int); bool test(int); + int getBlockSize(); + void setBlockSize(int); } structure(DataBlock, external = "yes", desc="..."){ diff --git a/src/mem/ruby/protocol/RubySlicc_MemControl.sm b/src/mem/ruby/protocol/RubySlicc_MemControl.sm index 012b169dea..848ada4d12 100644 --- a/src/mem/ruby/protocol/RubySlicc_MemControl.sm +++ b/src/mem/ruby/protocol/RubySlicc_MemControl.sm @@ -89,7 +89,9 @@ structure(MemoryMsg, desc="...", interface="Message") { if ((MessageSize == MessageSizeType:Response_Data) || (MessageSize == MessageSizeType:Writeback_Data)) { WriteMask read_mask; - read_mask.setMask(addressOffset(addr, makeLineAddress(addr)), Len, true); + read_mask.setBlockSize(mask.getBlockSize()); + read_mask.setMask(addressOffset(addr, + makeLineAddress(addr, mask.getBlockSize())), Len, true); if (MessageSize != MessageSizeType:Writeback_Data) { read_mask.setInvertedMask(mask); } diff --git a/src/mem/ruby/protocol/RubySlicc_Types.sm b/src/mem/ruby/protocol/RubySlicc_Types.sm index 4e0e4f4511..848d16491d 100644 --- a/src/mem/ruby/protocol/RubySlicc_Types.sm +++ b/src/mem/ruby/protocol/RubySlicc_Types.sm @@ -94,7 +94,7 @@ structure (Set, external = "yes", non_obj="yes") { NodeID smallestElement(); } -structure (NetDest, external = "yes", non_obj="yes") { +structure (NetDest, external = "yes", non_obj="yes", implicit_ctor="m_ruby_system") { void setSize(int); void 
setSize(int, int); void add(NodeID); diff --git a/src/mem/ruby/protocol/RubySlicc_Util.sm b/src/mem/ruby/protocol/RubySlicc_Util.sm index 104c7c034c..93976bc4e1 100644 --- a/src/mem/ruby/protocol/RubySlicc_Util.sm +++ b/src/mem/ruby/protocol/RubySlicc_Util.sm @@ -52,6 +52,7 @@ Addr intToAddress(int addr); int addressOffset(Addr addr, Addr base); int max_tokens(); Addr makeLineAddress(Addr addr); +Addr makeLineAddress(Addr addr, int cacheLineBits); int getOffset(Addr addr); int mod(int val, int mod); Addr bitSelect(Addr addr, int small, int big); diff --git a/src/mem/ruby/protocol/chi/CHI-cache.sm b/src/mem/ruby/protocol/chi/CHI-cache.sm index dcd142ea47..a644bbe506 100644 --- a/src/mem/ruby/protocol/chi/CHI-cache.sm +++ b/src/mem/ruby/protocol/chi/CHI-cache.sm @@ -574,7 +574,7 @@ machine(MachineType:Cache, "Cache coherency protocol") : //////////////////////////////////////////////////////////////////////////// // Cache block size - int blockSize, default="RubySystem::getBlockSizeBytes()"; + int blockSize, default="m_ruby_system->getBlockSizeBytes()"; // CacheEntry structure(CacheEntry, interface="AbstractCacheEntry") { diff --git a/src/mem/ruby/protocol/chi/CHI-dvm-misc-node.sm b/src/mem/ruby/protocol/chi/CHI-dvm-misc-node.sm index aa27c40964..f7616e9ec4 100644 --- a/src/mem/ruby/protocol/chi/CHI-dvm-misc-node.sm +++ b/src/mem/ruby/protocol/chi/CHI-dvm-misc-node.sm @@ -192,7 +192,7 @@ machine(MachineType:MiscNode, "CHI Misc Node for handling and distrbuting DVM op //////////////////////////////////////////////////////////////////////////// // Cache block size - int blockSize, default="RubySystem::getBlockSizeBytes()"; + int blockSize, default="m_ruby_system->getBlockSizeBytes()"; // Helper class for tracking expected response and data messages structure(ExpectedMap, external ="yes") { diff --git a/src/mem/ruby/protocol/chi/CHI-mem.sm b/src/mem/ruby/protocol/chi/CHI-mem.sm index 46f57456a5..58f22d2007 100644 --- a/src/mem/ruby/protocol/chi/CHI-mem.sm +++ 
b/src/mem/ruby/protocol/chi/CHI-mem.sm @@ -157,7 +157,7 @@ machine(MachineType:Memory, "Memory controller interface") : //////////////////////////////////////////////////////////////////////////// // Cache block size - int blockSize, default="RubySystem::getBlockSizeBytes()"; + int blockSize, default="m_ruby_system->getBlockSizeBytes()"; // TBE fields structure(TBE, desc="...") { diff --git a/src/mem/ruby/slicc_interface/AbstractCacheEntry.hh b/src/mem/ruby/slicc_interface/AbstractCacheEntry.hh index 0e00a60c28..1305deddce 100644 --- a/src/mem/ruby/slicc_interface/AbstractCacheEntry.hh +++ b/src/mem/ruby/slicc_interface/AbstractCacheEntry.hh @@ -59,6 +59,8 @@ namespace gem5 namespace ruby { +class RubySystem; + class AbstractCacheEntry : public ReplaceableEntry { private: @@ -78,16 +80,15 @@ class AbstractCacheEntry : public ReplaceableEntry // The methods below are those called by ruby runtime, add when it // is absolutely necessary and should all be virtual function. - virtual DataBlock& + [[noreturn]] virtual DataBlock& getDataBlk() { panic("getDataBlk() not implemented!"); - - // Dummy return to appease the compiler - static DataBlock b; - return b; } + virtual void initBlockSize(int block_size) { }; + virtual void setRubySystem(RubySystem *rs) { }; + int validBlocks; virtual int& getNumValidBlocks() { diff --git a/src/mem/ruby/slicc_interface/AbstractController.cc b/src/mem/ruby/slicc_interface/AbstractController.cc index 36092387ac..0bcc662629 100644 --- a/src/mem/ruby/slicc_interface/AbstractController.cc +++ b/src/mem/ruby/slicc_interface/AbstractController.cc @@ -89,6 +89,9 @@ AbstractController::init() getMemReqQueue()->setConsumer(this); } + downstreamDestinations.setRubySystem(m_ruby_system); + upstreamDestinations.setRubySystem(m_ruby_system); + // Initialize the addr->downstream machine mappings. Multiple machines // in downstream_destinations can have the same address range if they have // different types. 
If this is the case, mapAddressToDownstreamMachine @@ -268,7 +271,7 @@ AbstractController::serviceMemoryQueue() } const MemoryMsg *mem_msg = (const MemoryMsg*)mem_queue->peek(); - unsigned int req_size = RubySystem::getBlockSizeBytes(); + unsigned int req_size = m_ruby_system->getBlockSizeBytes(); if (mem_msg->m_Len > 0) { req_size = mem_msg->m_Len; } @@ -294,7 +297,7 @@ AbstractController::serviceMemoryQueue() SenderState *s = new SenderState(mem_msg->m_Sender); pkt->pushSenderState(s); - if (RubySystem::getWarmupEnabled()) { + if (m_ruby_system->getWarmupEnabled()) { // Use functional rather than timing accesses during warmup mem_queue->dequeue(clockEdge()); memoryPort.sendFunctional(pkt); @@ -382,7 +385,10 @@ AbstractController::recvTimingResp(PacketPtr pkt) return false; } - std::shared_ptr msg = std::make_shared(clockEdge()); + int blk_size = m_ruby_system->getBlockSizeBytes(); + + std::shared_ptr msg = + std::make_shared(clockEdge(), blk_size, m_ruby_system); (*msg).m_addr = pkt->getAddr(); (*msg).m_Sender = m_machineID; @@ -396,7 +402,7 @@ AbstractController::recvTimingResp(PacketPtr pkt) // Copy data from the packet (*msg).m_DataBlk.setData(pkt->getPtr(), 0, - RubySystem::getBlockSizeBytes()); + m_ruby_system->getBlockSizeBytes()); } else if (pkt->isWrite()) { (*msg).m_Type = MemoryRequestType_MEMORY_WB; (*msg).m_MessageSize = MessageSizeType_Writeback_Control; @@ -404,7 +410,8 @@ AbstractController::recvTimingResp(PacketPtr pkt) panic("Incorrect packet type received from memory controller!"); } - memRspQueue->enqueue(msg, clockEdge(), cyclesToTicks(Cycles(1))); + memRspQueue->enqueue(msg, clockEdge(), cyclesToTicks(Cycles(1)), + m_ruby_system->getRandomization(), m_ruby_system->getWarmupEnabled()); delete pkt; return true; } @@ -471,6 +478,45 @@ AbstractController::sendRetryRespToMem() { } } +Addr +AbstractController::getOffset(Addr addr) const +{ + return ruby::getOffset(addr, m_ruby_system->getBlockSizeBits()); +} + +Addr 
+AbstractController::makeLineAddress(Addr addr) const +{ + return ruby::makeLineAddress(addr, m_ruby_system->getBlockSizeBits()); +} + +std::string +AbstractController::printAddress(Addr addr) const +{ + return ruby::printAddress(addr, m_ruby_system->getBlockSizeBits()); +} + +NetDest +AbstractController::broadcast(MachineType type) +{ + assert(m_ruby_system != nullptr); + NodeID type_count = m_ruby_system->MachineType_base_count(type); + + NetDest dest; + for (NodeID i = 0; i < type_count; i++) { + MachineID mach = {type, i}; + dest.add(mach); + } + return dest; +} + +int +AbstractController::machineCount(MachineType machType) +{ + assert(m_ruby_system != nullptr); + return m_ruby_system->MachineType_base_count(machType); +} + bool AbstractController::MemoryPort::recvTimingResp(PacketPtr pkt) { diff --git a/src/mem/ruby/slicc_interface/AbstractController.hh b/src/mem/ruby/slicc_interface/AbstractController.hh index ce6a6972af..79f67073a6 100644 --- a/src/mem/ruby/slicc_interface/AbstractController.hh +++ b/src/mem/ruby/slicc_interface/AbstractController.hh @@ -72,6 +72,7 @@ namespace ruby class Network; class GPUCoalescer; class DMASequencer; +class RubySystem; // used to communicate that an in_port peeked the wrong message type class RejectException: public std::exception @@ -229,6 +230,11 @@ class AbstractController : public ClockedObject, public Consumer /** List of upstream destinations (towards the CPU) */ const NetDest& allUpstreamDest() const { return upstreamDestinations; } + // Helper methods for commonly used functions called in common/address.hh + Addr getOffset(Addr addr) const; + Addr makeLineAddress(Addr addr) const; + std::string printAddress(Addr addr) const; + protected: //! Profiles original cache requests including PUTs void profileRequest(const std::string &request); @@ -452,6 +458,13 @@ class AbstractController : public ClockedObject, public Consumer {} }; + RubySystem *m_ruby_system = nullptr; + + // Formerly in RubySlicc_ComponentMapping.hh. 
Moved here to access + // RubySystem pointer. + NetDest broadcast(MachineType type); + int machineCount(MachineType machType); + private: /** The address range to which the controller responds on the CPU side. */ const AddrRangeList addrRanges; diff --git a/src/mem/ruby/slicc_interface/Message.hh b/src/mem/ruby/slicc_interface/Message.hh index 5c824c4a38..31fb5e8e92 100644 --- a/src/mem/ruby/slicc_interface/Message.hh +++ b/src/mem/ruby/slicc_interface/Message.hh @@ -62,10 +62,12 @@ typedef std::shared_ptr MsgPtr; class Message { public: - Message(Tick curTime) - : m_time(curTime), + Message(Tick curTime, int block_size, const RubySystem *rs) + : m_block_size(block_size), + m_time(curTime), m_LastEnqueueTime(curTime), - m_DelayedTicks(0), m_msg_counter(0) + m_DelayedTicks(0), m_msg_counter(0), + p_ruby_system(rs) { } Message(const Message &other) = default; @@ -121,6 +123,9 @@ class Message int getVnet() const { return vnet; } void setVnet(int net) { vnet = net; } + protected: + int m_block_size = 0; + private: Tick m_time; Tick m_LastEnqueueTime; // my last enqueue time @@ -130,6 +135,9 @@ class Message // Variables for required network traversal int incoming_link; int vnet; + + // Needed to call MacheinType_base_count/level + const RubySystem *p_ruby_system = nullptr; }; inline bool diff --git a/src/mem/ruby/slicc_interface/RubyRequest.hh b/src/mem/ruby/slicc_interface/RubyRequest.hh index a258a18f9a..58eae229be 100644 --- a/src/mem/ruby/slicc_interface/RubyRequest.hh +++ b/src/mem/ruby/slicc_interface/RubyRequest.hh @@ -86,11 +86,12 @@ class RubyRequest : public Message bool m_isSLCSet; bool m_isSecure; - RubyRequest(Tick curTime, uint64_t _paddr, int _len, + RubyRequest(Tick curTime, int block_size, RubySystem *rs, + uint64_t _paddr, int _len, uint64_t _pc, RubyRequestType _type, RubyAccessMode _access_mode, PacketPtr _pkt, PrefetchBit _pb = PrefetchBit_No, ContextID _proc_id = 100, ContextID _core_id = 99) - : Message(curTime), + : Message(curTime, block_size, 
rs), m_PhysicalAddress(_paddr), m_Type(_type), m_ProgramCounter(_pc), @@ -99,13 +100,16 @@ class RubyRequest : public Message m_Prefetch(_pb), m_pkt(_pkt), m_contextId(_core_id), + m_writeMask(block_size), + m_WTData(block_size), m_htmFromTransaction(false), m_htmTransactionUid(0), m_isTlbi(false), m_tlbiTransactionUid(0), m_isSecure(m_pkt ? m_pkt->req->isSecure() : false) { - m_LineAddress = makeLineAddress(m_PhysicalAddress); + int block_size_bits = floorLog2(block_size); + m_LineAddress = makeLineAddress(m_PhysicalAddress, block_size_bits); if (_pkt) { m_isGLCSet = m_pkt->req->isGLCSet(); m_isSLCSet = m_pkt->req->isSLCSet(); @@ -116,10 +120,10 @@ class RubyRequest : public Message } /** RubyRequest for memory management commands */ - RubyRequest(Tick curTime, + RubyRequest(Tick curTime, int block_size, RubySystem *rs, uint64_t _pc, RubyRequestType _type, RubyAccessMode _access_mode, PacketPtr _pkt, ContextID _proc_id, ContextID _core_id) - : Message(curTime), + : Message(curTime, block_size, rs), m_PhysicalAddress(0), m_Type(_type), m_ProgramCounter(_pc), @@ -128,6 +132,8 @@ class RubyRequest : public Message m_Prefetch(PrefetchBit_No), m_pkt(_pkt), m_contextId(_core_id), + m_writeMask(block_size), + m_WTData(block_size), m_htmFromTransaction(false), m_htmTransactionUid(0), m_isTlbi(false), @@ -144,14 +150,14 @@ class RubyRequest : public Message } } - RubyRequest(Tick curTime, uint64_t _paddr, int _len, - uint64_t _pc, RubyRequestType _type, + RubyRequest(Tick curTime, int block_size, RubySystem *rs, + uint64_t _paddr, int _len, uint64_t _pc, RubyRequestType _type, RubyAccessMode _access_mode, PacketPtr _pkt, PrefetchBit _pb, unsigned _proc_id, unsigned _core_id, int _wm_size, std::vector & _wm_mask, DataBlock & _Data, uint64_t _instSeqNum = 0) - : Message(curTime), + : Message(curTime, block_size, rs), m_PhysicalAddress(_paddr), m_Type(_type), m_ProgramCounter(_pc), @@ -170,7 +176,8 @@ class RubyRequest : public Message m_tlbiTransactionUid(0), 
m_isSecure(m_pkt->req->isSecure()) { - m_LineAddress = makeLineAddress(m_PhysicalAddress); + int block_size_bits = floorLog2(block_size); + m_LineAddress = makeLineAddress(m_PhysicalAddress, block_size_bits); if (_pkt) { m_isGLCSet = m_pkt->req->isGLCSet(); m_isSLCSet = m_pkt->req->isSLCSet(); @@ -180,15 +187,15 @@ class RubyRequest : public Message } } - RubyRequest(Tick curTime, uint64_t _paddr, int _len, - uint64_t _pc, RubyRequestType _type, + RubyRequest(Tick curTime, int block_size, RubySystem *rs, + uint64_t _paddr, int _len, uint64_t _pc, RubyRequestType _type, RubyAccessMode _access_mode, PacketPtr _pkt, PrefetchBit _pb, unsigned _proc_id, unsigned _core_id, int _wm_size, std::vector & _wm_mask, DataBlock & _Data, std::vector< std::pair > _atomicOps, uint64_t _instSeqNum = 0) - : Message(curTime), + : Message(curTime, block_size, rs), m_PhysicalAddress(_paddr), m_Type(_type), m_ProgramCounter(_pc), @@ -207,7 +214,8 @@ class RubyRequest : public Message m_tlbiTransactionUid(0), m_isSecure(m_pkt->req->isSecure()) { - m_LineAddress = makeLineAddress(m_PhysicalAddress); + int block_size_bits = floorLog2(block_size); + m_LineAddress = makeLineAddress(m_PhysicalAddress, block_size_bits); if (_pkt) { m_isGLCSet = m_pkt->req->isGLCSet(); m_isSLCSet = m_pkt->req->isSLCSet(); @@ -218,7 +226,12 @@ class RubyRequest : public Message } } - RubyRequest(Tick curTime) : Message(curTime) {} + RubyRequest(Tick curTime, int block_size, RubySystem *rs) + : Message(curTime, block_size, rs), + m_writeMask(block_size), + m_WTData(block_size) + { + } MsgPtr clone() const { return std::shared_ptr(new RubyRequest(*this)); } diff --git a/src/mem/ruby/slicc_interface/RubySlicc_ComponentMapping.hh b/src/mem/ruby/slicc_interface/RubySlicc_ComponentMapping.hh index 9a433d1cee..1195089fc3 100644 --- a/src/mem/ruby/slicc_interface/RubySlicc_ComponentMapping.hh +++ b/src/mem/ruby/slicc_interface/RubySlicc_ComponentMapping.hh @@ -41,17 +41,6 @@ namespace gem5 namespace ruby { -inline 
NetDest -broadcast(MachineType type) -{ - NetDest dest; - for (NodeID i = 0; i < MachineType_base_count(type); i++) { - MachineID mach = {type, i}; - dest.add(mach); - } - return dest; -} - inline MachineID mapAddressToRange(Addr addr, MachineType type, int low_bit, int num_bits, int cluster_id = 0) @@ -77,12 +66,6 @@ machineIDToMachineType(MachineID machID) return machID.type; } -inline int -machineCount(MachineType machType) -{ - return MachineType_base_count(machType); -} - inline MachineID createMachineID(MachineType type, NodeID id) { diff --git a/src/mem/ruby/slicc_interface/RubySlicc_Util.hh b/src/mem/ruby/slicc_interface/RubySlicc_Util.hh index 8df56c7013..f4a49463a8 100644 --- a/src/mem/ruby/slicc_interface/RubySlicc_Util.hh +++ b/src/mem/ruby/slicc_interface/RubySlicc_Util.hh @@ -233,8 +233,9 @@ addressOffset(Addr addr, Addr base) inline bool testAndRead(Addr addr, DataBlock& blk, Packet *pkt) { - Addr pktLineAddr = makeLineAddress(pkt->getAddr()); - Addr lineAddr = makeLineAddress(addr); + int block_size_bits = floorLog2(blk.getBlockSize()); + Addr pktLineAddr = makeLineAddress(pkt->getAddr(), block_size_bits); + Addr lineAddr = makeLineAddress(addr, block_size_bits); if (pktLineAddr == lineAddr) { uint8_t *data = pkt->getPtr(); @@ -259,8 +260,10 @@ testAndRead(Addr addr, DataBlock& blk, Packet *pkt) inline bool testAndReadMask(Addr addr, DataBlock& blk, WriteMask& mask, Packet *pkt) { - Addr pktLineAddr = makeLineAddress(pkt->getAddr()); - Addr lineAddr = makeLineAddress(addr); + assert(blk.getBlockSize() == mask.getBlockSize()); + int block_size_bits = floorLog2(blk.getBlockSize()); + Addr pktLineAddr = makeLineAddress(pkt->getAddr(), block_size_bits); + Addr lineAddr = makeLineAddress(addr, block_size_bits); if (pktLineAddr == lineAddr) { uint8_t *data = pkt->getPtr(); @@ -288,8 +291,9 @@ testAndReadMask(Addr addr, DataBlock& blk, WriteMask& mask, Packet *pkt) inline bool testAndWrite(Addr addr, DataBlock& blk, Packet *pkt) { - Addr pktLineAddr = 
makeLineAddress(pkt->getAddr()); - Addr lineAddr = makeLineAddress(addr); + int block_size_bits = floorLog2(blk.getBlockSize()); + Addr pktLineAddr = makeLineAddress(pkt->getAddr(), block_size_bits); + Addr lineAddr = makeLineAddress(addr, block_size_bits); if (pktLineAddr == lineAddr) { const uint8_t *data = pkt->getConstPtr(); diff --git a/src/mem/ruby/structures/ALUFreeListArray.cc b/src/mem/ruby/structures/ALUFreeListArray.cc index 87b5cbfbd2..3e25e5b599 100644 --- a/src/mem/ruby/structures/ALUFreeListArray.cc +++ b/src/mem/ruby/structures/ALUFreeListArray.cc @@ -57,10 +57,10 @@ namespace ruby * - The same line has been accessed in the past accessLatency ticks */ -ALUFreeListArray::ALUFreeListArray(unsigned int num_ALUs, Tick access_latency) +ALUFreeListArray::ALUFreeListArray(unsigned int num_ALUs, Cycles access_clocks) { this->numALUs = num_ALUs; - this->accessLatency = access_latency; + this->accessClocks = access_clocks; } bool ALUFreeListArray::tryAccess(Addr addr) @@ -85,7 +85,7 @@ bool ALUFreeListArray::tryAccess(Addr addr) } // Block access if the line is already being used - if (record.lineAddr == makeLineAddress(addr)) { + if (record.lineAddr == makeLineAddress(addr, m_block_size_bits)) { return false; } } @@ -99,7 +99,9 @@ void ALUFreeListArray::reserve(Addr addr) // the access is valid // Add record to queue - accessQueue.push_front(AccessRecord(makeLineAddress(addr), curTick())); + accessQueue.push_front( + AccessRecord(makeLineAddress(addr, m_block_size_bits), curTick()) + ); } } // namespace ruby diff --git a/src/mem/ruby/structures/ALUFreeListArray.hh b/src/mem/ruby/structures/ALUFreeListArray.hh index bed1b00b5c..5c4fdd95f9 100644 --- a/src/mem/ruby/structures/ALUFreeListArray.hh +++ b/src/mem/ruby/structures/ALUFreeListArray.hh @@ -32,6 +32,7 @@ #include +#include "base/intmath.hh" #include "mem/ruby/common/TypeDefines.hh" #include "sim/cur_tick.hh" @@ -45,7 +46,8 @@ class ALUFreeListArray { private: unsigned int numALUs; - Tick accessLatency; 
+ Cycles accessClocks; + Tick accessLatency = 0; class AccessRecord { @@ -62,14 +64,33 @@ class ALUFreeListArray // Queue of accesses from past accessLatency cycles std::deque accessQueue; + int m_block_size_bits = 0; + public: - ALUFreeListArray(unsigned int num_ALUs, Tick access_latency); + ALUFreeListArray(unsigned int num_ALUs, Cycles access_clocks); bool tryAccess(Addr addr); void reserve(Addr addr); - Tick getLatency() const { return accessLatency; } + Tick + getLatency() const + { + assert(accessLatency > 0); + return accessLatency; + } + + void + setClockPeriod(Tick clockPeriod) + { + accessLatency = accessClocks * clockPeriod; + } + + void + setBlockSize(int block_size) + { + m_block_size_bits = floorLog2(block_size); + } }; } // namespace ruby diff --git a/src/mem/ruby/structures/BankedArray.cc b/src/mem/ruby/structures/BankedArray.cc index 0f01d5c396..2c2202dec5 100644 --- a/src/mem/ruby/structures/BankedArray.cc +++ b/src/mem/ruby/structures/BankedArray.cc @@ -42,8 +42,7 @@ namespace ruby { BankedArray::BankedArray(unsigned int banks, Cycles accessLatency, - unsigned int startIndexBit, RubySystem *rs) - : m_ruby_system(rs) + unsigned int startIndexBit) { this->banks = banks; this->accessLatency = accessLatency; @@ -78,6 +77,8 @@ BankedArray::reserve(int64_t idx) if (accessLatency == 0) return; + assert(clockPeriod > 0); + unsigned int bank = mapIndexToBank(idx); assert(bank < banks); @@ -95,7 +96,7 @@ BankedArray::reserve(int64_t idx) busyBanks[bank].idx = idx; busyBanks[bank].startAccess = curTick(); busyBanks[bank].endAccess = curTick() + - (accessLatency-1) * m_ruby_system->clockPeriod(); + (accessLatency-1) * clockPeriod; } unsigned int diff --git a/src/mem/ruby/structures/BankedArray.hh b/src/mem/ruby/structures/BankedArray.hh index c757759296..ecc984a617 100644 --- a/src/mem/ruby/structures/BankedArray.hh +++ b/src/mem/ruby/structures/BankedArray.hh @@ -48,6 +48,7 @@ class BankedArray private: unsigned int banks; Cycles accessLatency; + Tick 
clockPeriod = 0; unsigned int bankBits; unsigned int startIndexBit; RubySystem *m_ruby_system; @@ -69,7 +70,7 @@ class BankedArray public: BankedArray(unsigned int banks, Cycles accessLatency, - unsigned int startIndexBit, RubySystem *rs); + unsigned int startIndexBit); // Note: We try the access based on the cache index, not the address // This is so we don't get aliasing on blocks being replaced @@ -78,6 +79,8 @@ class BankedArray void reserve(int64_t idx); Cycles getLatency() const { return accessLatency; } + + void setClockPeriod(Tick _clockPeriod) { clockPeriod = _clockPeriod; } }; } // namespace ruby diff --git a/src/mem/ruby/structures/CacheMemory.cc b/src/mem/ruby/structures/CacheMemory.cc index 90d67fb29b..6bc35bac7d 100644 --- a/src/mem/ruby/structures/CacheMemory.cc +++ b/src/mem/ruby/structures/CacheMemory.cc @@ -69,12 +69,9 @@ operator<<(std::ostream& out, const CacheMemory& obj) CacheMemory::CacheMemory(const Params &p) : SimObject(p), - dataArray(p.dataArrayBanks, p.dataAccessLatency, - p.start_index_bit, p.ruby_system), - tagArray(p.tagArrayBanks, p.tagAccessLatency, - p.start_index_bit, p.ruby_system), - atomicALUArray(p.atomicALUs, p.atomicLatency * - p.ruby_system->clockPeriod()), + dataArray(p.dataArrayBanks, p.dataAccessLatency, p.start_index_bit), + tagArray(p.tagArrayBanks, p.tagAccessLatency, p.start_index_bit), + atomicALUArray(p.atomicALUs, p.atomicLatency), cacheMemoryStats(this) { m_cache_size = p.size; @@ -88,12 +85,25 @@ CacheMemory::CacheMemory(const Params &p) m_replacementPolicy_ptr) ? 
true : false; } +void +CacheMemory::setRubySystem(RubySystem* rs) +{ + dataArray.setClockPeriod(rs->clockPeriod()); + tagArray.setClockPeriod(rs->clockPeriod()); + atomicALUArray.setClockPeriod(rs->clockPeriod()); + atomicALUArray.setBlockSize(rs->getBlockSizeBytes()); + + if (m_block_size == 0) { + m_block_size = rs->getBlockSizeBytes(); + } + + m_ruby_system = rs; +} + void CacheMemory::init() { - if (m_block_size == 0) { - m_block_size = RubySystem::getBlockSizeBytes(); - } + assert(m_block_size != 0); m_cache_num_sets = (m_cache_size / m_cache_assoc) / m_block_size; assert(m_cache_num_sets > 1); m_cache_num_set_bits = floorLog2(m_cache_num_sets); @@ -286,6 +296,9 @@ CacheMemory::allocate(Addr address, AbstractCacheEntry *entry) assert(cacheAvail(address)); DPRINTF(RubyCache, "allocating address: %#x\n", address); + entry->initBlockSize(m_block_size); + entry->setRubySystem(m_ruby_system); + // Find the first open slot int64_t cacheSet = addressToCacheSet(address); std::vector &set = m_cache[cacheSet]; diff --git a/src/mem/ruby/structures/CacheMemory.hh b/src/mem/ruby/structures/CacheMemory.hh index de7c327f63..912ae22d1f 100644 --- a/src/mem/ruby/structures/CacheMemory.hh +++ b/src/mem/ruby/structures/CacheMemory.hh @@ -154,6 +154,8 @@ class CacheMemory : public SimObject void htmAbortTransaction(); void htmCommitTransaction(); + void setRubySystem(RubySystem* rs); + public: int getCacheSize() const { return m_cache_size; } int getCacheAssoc() const { return m_cache_assoc; } @@ -213,6 +215,14 @@ class CacheMemory : public SimObject */ bool m_use_occupancy; + RubySystem *m_ruby_system = nullptr; + + Addr + makeLineAddress(Addr addr) const + { + return ruby::makeLineAddress(addr, floorLog2(m_block_size)); + } + private: struct CacheMemoryStats : public statistics::Group { diff --git a/src/mem/ruby/structures/DirectoryMemory.cc b/src/mem/ruby/structures/DirectoryMemory.cc index 620254b82c..7469f72451 100644 --- a/src/mem/ruby/structures/DirectoryMemory.cc +++ 
b/src/mem/ruby/structures/DirectoryMemory.cc @@ -64,12 +64,14 @@ DirectoryMemory::DirectoryMemory(const Params &p) } m_size_bits = floorLog2(m_size_bytes); m_num_entries = 0; + m_block_size = p.block_size; + m_ruby_system = p.ruby_system; } void DirectoryMemory::init() { - m_num_entries = m_size_bytes / RubySystem::getBlockSizeBytes(); + m_num_entries = m_size_bytes / m_block_size; m_entries = new AbstractCacheEntry*[m_num_entries]; for (int i = 0; i < m_num_entries; i++) m_entries[i] = NULL; @@ -108,7 +110,7 @@ DirectoryMemory::mapAddressToLocalIdx(Addr address) } ret += r.size(); } - return ret >> RubySystem::getBlockSizeBits(); + return ret >> (floorLog2(m_block_size)); } AbstractCacheEntry* @@ -133,6 +135,8 @@ DirectoryMemory::allocate(Addr address, AbstractCacheEntry *entry) assert(idx < m_num_entries); assert(m_entries[idx] == NULL); entry->changePermission(AccessPermission_Read_Only); + entry->initBlockSize(m_block_size); + entry->setRubySystem(m_ruby_system); m_entries[idx] = entry; return entry; diff --git a/src/mem/ruby/structures/DirectoryMemory.hh b/src/mem/ruby/structures/DirectoryMemory.hh index 8a4532864d..6e77e2a4ca 100644 --- a/src/mem/ruby/structures/DirectoryMemory.hh +++ b/src/mem/ruby/structures/DirectoryMemory.hh @@ -104,6 +104,9 @@ class DirectoryMemory : public SimObject uint64_t m_size_bytes; uint64_t m_size_bits; uint64_t m_num_entries; + uint32_t m_block_size; + + RubySystem *m_ruby_system = nullptr; /** * The address range for which the directory responds. Normally diff --git a/src/mem/ruby/structures/DirectoryMemory.py b/src/mem/ruby/structures/DirectoryMemory.py index 85f05367cf..202617bceb 100644 --- a/src/mem/ruby/structures/DirectoryMemory.py +++ b/src/mem/ruby/structures/DirectoryMemory.py @@ -49,3 +49,7 @@ class RubyDirectoryMemory(SimObject): addr_ranges = VectorParam.AddrRange( Parent.addr_ranges, "Address range this directory responds to" ) + block_size = Param.UInt32( + "Size of a block in bytes. 
Usually same as cache line size." + ) + ruby_system = Param.RubySystem(Parent.any, "") diff --git a/src/mem/ruby/structures/PerfectCacheMemory.hh b/src/mem/ruby/structures/PerfectCacheMemory.hh index 664d10f202..0966ca80d2 100644 --- a/src/mem/ruby/structures/PerfectCacheMemory.hh +++ b/src/mem/ruby/structures/PerfectCacheMemory.hh @@ -74,6 +74,8 @@ class PerfectCacheMemory public: PerfectCacheMemory(); + void setBlockSize(const int block_size) { m_block_size = block_size; } + // tests to see if an address is present in the cache bool isTagPresent(Addr address) const; @@ -108,6 +110,8 @@ class PerfectCacheMemory // Data Members (m_prefix) std::unordered_map > m_map; + + int m_block_size = 0; }; template @@ -130,7 +134,7 @@ template inline bool PerfectCacheMemory::isTagPresent(Addr address) const { - return m_map.count(makeLineAddress(address)) > 0; + return m_map.count(makeLineAddress(address, floorLog2(m_block_size))) > 0; } template @@ -149,7 +153,8 @@ PerfectCacheMemory::allocate(Addr address) PerfectCacheLineState line_state; line_state.m_permission = AccessPermission_Invalid; line_state.m_entry = ENTRY(); - m_map[makeLineAddress(address)] = line_state; + Addr line_addr = makeLineAddress(address, floorLog2(m_block_size)); + m_map.emplace(line_addr, line_state); } // deallocate entry @@ -157,7 +162,8 @@ template inline void PerfectCacheMemory::deallocate(Addr address) { - [[maybe_unused]] auto num_erased = m_map.erase(makeLineAddress(address)); + Addr line_addr = makeLineAddress(address, floorLog2(m_block_size)); + [[maybe_unused]] auto num_erased = m_map.erase(line_addr); assert(num_erased == 1); } @@ -175,7 +181,8 @@ template inline ENTRY* PerfectCacheMemory::lookup(Addr address) { - return &m_map[makeLineAddress(address)].m_entry; + Addr line_addr = makeLineAddress(address, floorLog2(m_block_size)); + return &m_map[line_addr].m_entry; } // looks an address up in the cache @@ -183,14 +190,16 @@ template inline const ENTRY* PerfectCacheMemory::lookup(Addr 
address) const { - return &m_map[makeLineAddress(address)].m_entry; + Addr line_addr = makeLineAddress(address, floorLog2(m_block_size)); + return &m_map[line_addr].m_entry; } template inline AccessPermission PerfectCacheMemory::getPermission(Addr address) const { - return m_map[makeLineAddress(address)].m_permission; + Addr line_addr = makeLineAddress(address, floorLog2(m_block_size)); + return m_map[line_addr].m_permission; } template @@ -198,8 +207,8 @@ inline void PerfectCacheMemory::changePermission(Addr address, AccessPermission new_perm) { - Addr line_address = makeLineAddress(address); - PerfectCacheLineState& line_state = m_map[line_address]; + Addr line_addr = makeLineAddress(address, floorLog2(m_block_size)); + PerfectCacheLineState& line_state = m_map[line_addr]; line_state.m_permission = new_perm; } diff --git a/src/mem/ruby/structures/PersistentTable.hh b/src/mem/ruby/structures/PersistentTable.hh index 5382269273..1162e1dda1 100644 --- a/src/mem/ruby/structures/PersistentTable.hh +++ b/src/mem/ruby/structures/PersistentTable.hh @@ -63,6 +63,12 @@ class PersistentTable // Destructor ~PersistentTable(); + void + setBlockSize(int block_size) + { + m_block_size_bits = floorLog2(block_size); + } + // Public Methods void persistentRequestLock(Addr address, MachineID locker, AccessType type); @@ -82,9 +88,17 @@ class PersistentTable PersistentTable(const PersistentTable& obj); PersistentTable& operator=(const PersistentTable& obj); + int m_block_size_bits = 0; + // Data Members (m_prefix) typedef std::unordered_map AddressMap; AddressMap m_map; + + Addr + makeLineAddress(Addr addr) const + { + return ruby::makeLineAddress(addr, m_block_size_bits); + } }; inline std::ostream& diff --git a/src/mem/ruby/structures/RubyCache.py b/src/mem/ruby/structures/RubyCache.py index 2f457f5c4a..4b1023fc61 100644 --- a/src/mem/ruby/structures/RubyCache.py +++ b/src/mem/ruby/structures/RubyCache.py @@ -54,4 +54,3 @@ class RubyCache(SimObject): dataAccessLatency = 
Param.Cycles(1, "cycles for a data array access") tagAccessLatency = Param.Cycles(1, "cycles for a tag array access") resourceStalls = Param.Bool(False, "stall if there is a resource failure") - ruby_system = Param.RubySystem(Parent.any, "") diff --git a/src/mem/ruby/structures/RubyPrefetcher.cc b/src/mem/ruby/structures/RubyPrefetcher.cc index e45eff2c2f..bffcfe2327 100644 --- a/src/mem/ruby/structures/RubyPrefetcher.cc +++ b/src/mem/ruby/structures/RubyPrefetcher.cc @@ -56,13 +56,15 @@ namespace ruby RubyPrefetcher::RubyPrefetcher(const Params &p) : SimObject(p), m_num_streams(p.num_streams), - m_array(p.num_streams), m_train_misses(p.train_misses), + m_array(p.num_streams, p.block_size), m_train_misses(p.train_misses), m_num_startup_pfs(p.num_startup_pfs), unitFilter(p.unit_filter), negativeFilter(p.unit_filter), nonUnitFilter(p.nonunit_filter), m_prefetch_cross_pages(p.cross_page), pageShift(p.page_shift), + m_block_size_bits(floorLog2(p.block_size)), + m_block_size_bytes(p.block_size), rubyPrefetcherStats(this) { assert(m_num_streams > 0); @@ -90,7 +92,7 @@ void RubyPrefetcher::observeMiss(Addr address, const RubyRequestType& type) { DPRINTF(RubyPrefetcher, "Observed miss for %#x\n", address); - Addr line_addr = makeLineAddress(address); + Addr line_addr = makeLineAddress(address, m_block_size_bits); rubyPrefetcherStats.numMissObserved++; // check to see if we have already issued a prefetch for this block @@ -214,7 +216,7 @@ RubyPrefetcher::initializeStream(Addr address, int stride, // initialize the stream prefetcher PrefetchEntry *mystream = &(m_array[index]); - mystream->m_address = makeLineAddress(address); + mystream->m_address = makeLineAddress(address, m_block_size_bits); mystream->m_stride = stride; mystream->m_use_time = m_controller->curCycle(); mystream->m_is_valid = true; @@ -222,7 +224,7 @@ RubyPrefetcher::initializeStream(Addr address, int stride, // create a number of initial prefetches for this stream Addr page_addr = 
pageAddress(mystream->m_address); - Addr line_addr = makeLineAddress(mystream->m_address); + Addr line_addr = makeLineAddress(mystream->m_address, m_block_size_bits); // insert a number of prefetches into the prefetch table for (int k = 0; k < m_num_startup_pfs; k++) { @@ -312,8 +314,7 @@ RubyPrefetcher::accessNonunitFilter(Addr line_addr, // This stride HAS to be the multiplicative constant of // dataBlockBytes (bc makeNextStrideAddress is // calculated based on this multiplicative constant!) - const int stride = entry.stride / - RubySystem::getBlockSizeBytes(); + const int stride = entry.stride / m_block_size_bytes; // clear this filter entry entry.clear(); diff --git a/src/mem/ruby/structures/RubyPrefetcher.hh b/src/mem/ruby/structures/RubyPrefetcher.hh index 51e1b3c480..5627410713 100644 --- a/src/mem/ruby/structures/RubyPrefetcher.hh +++ b/src/mem/ruby/structures/RubyPrefetcher.hh @@ -68,10 +68,10 @@ class PrefetchEntry { public: /// constructor - PrefetchEntry() + PrefetchEntry(int block_size) { // default: 1 cache-line stride - m_stride = (1 << RubySystem::getBlockSizeBits()); + m_stride = (1 << floorLog2(block_size)); m_use_time = Cycles(0); m_is_valid = false; } @@ -239,6 +239,16 @@ class RubyPrefetcher : public SimObject const unsigned pageShift; + int m_block_size_bits = 0; + int m_block_size_bytes = 0; + + Addr + makeNextStrideAddress(Addr addr, int stride) const + { + return ruby::makeNextStrideAddress(addr, stride, + m_block_size_bytes); + } + struct RubyPrefetcherStats : public statistics::Group { RubyPrefetcherStats(statistics::Group *parent); diff --git a/src/mem/ruby/structures/RubyPrefetcher.py b/src/mem/ruby/structures/RubyPrefetcher.py index d4189ae7d5..155b7c314d 100644 --- a/src/mem/ruby/structures/RubyPrefetcher.py +++ b/src/mem/ruby/structures/RubyPrefetcher.py @@ -62,6 +62,9 @@ class RubyPrefetcher(SimObject): page_shift = Param.UInt32( 12, "Number of bits to mask to get a page number" ) + block_size = Param.UInt32( + "Size of block to 
prefetch, usually cache line size" + ) class Prefetcher(RubyPrefetcher): diff --git a/src/mem/ruby/structures/RubyPrefetcherProxy.cc b/src/mem/ruby/structures/RubyPrefetcherProxy.cc index 2a29fbc88e..a6fed8258c 100644 --- a/src/mem/ruby/structures/RubyPrefetcherProxy.cc +++ b/src/mem/ruby/structures/RubyPrefetcherProxy.cc @@ -66,7 +66,7 @@ RubyPrefetcherProxy::RubyPrefetcherProxy(AbstractController* _parent, prefetcher->setParentInfo( cacheCntrl->params().system, cacheCntrl->getProbeManager(), - RubySystem::getBlockSizeBytes()); + cacheCntrl->m_ruby_system->getBlockSizeBytes()); } } @@ -112,7 +112,7 @@ RubyPrefetcherProxy::issuePrefetch() if (pkt) { DPRINTF(HWPrefetch, "Next prefetch ready %s\n", pkt->print()); - unsigned blk_size = RubySystem::getBlockSizeBytes(); + unsigned blk_size = cacheCntrl->m_ruby_system->getBlockSizeBytes(); Addr line_addr = pkt->getBlockAddr(blk_size); if (issuedPfPkts.count(line_addr) == 0) { @@ -126,6 +126,8 @@ RubyPrefetcherProxy::issuePrefetch() std::shared_ptr msg = std::make_shared(cacheCntrl->clockEdge(), + blk_size, + cacheCntrl->m_ruby_system, pkt->getAddr(), blk_size, 0, // pc @@ -136,7 +138,10 @@ RubyPrefetcherProxy::issuePrefetch() // enqueue request into prefetch queue to the cache pfQueue->enqueue(msg, cacheCntrl->clockEdge(), - cacheCntrl->cyclesToTicks(Cycles(1))); + cacheCntrl->cyclesToTicks(Cycles(1)), + cacheCntrl->m_ruby_system->getRandomization(), + cacheCntrl->m_ruby_system->getWarmupEnabled() + ); // track all pending PF requests issuedPfPkts[line_addr] = pkt; @@ -230,5 +235,19 @@ RubyPrefetcherProxy::regProbePoints() cacheCntrl->getProbeManager(), "Data Update"); } +Addr +RubyPrefetcherProxy::makeLineAddress(Addr addr) const +{ + return ruby::makeLineAddress(addr, + cacheCntrl->m_ruby_system->getBlockSizeBits()); +} + +Addr +RubyPrefetcherProxy::getOffset(Addr addr) const +{ + return ruby::getOffset(addr, + cacheCntrl->m_ruby_system->getBlockSizeBits()); +} + } // namespace ruby } // namespace gem5 diff --git 
a/src/mem/ruby/structures/RubyPrefetcherProxy.hh b/src/mem/ruby/structures/RubyPrefetcherProxy.hh index 34c40154b6..e7c044edf8 100644 --- a/src/mem/ruby/structures/RubyPrefetcherProxy.hh +++ b/src/mem/ruby/structures/RubyPrefetcherProxy.hh @@ -142,6 +142,9 @@ class RubyPrefetcherProxy : public CacheAccessor, public Named */ ProbePointArg *ppDataUpdate; + Addr makeLineAddress(Addr addr) const; + Addr getOffset(Addr addr) const; + public: /** Accessor functions */ diff --git a/src/mem/ruby/structures/TBETable.hh b/src/mem/ruby/structures/TBETable.hh index 9030d52d9f..72770ce42f 100644 --- a/src/mem/ruby/structures/TBETable.hh +++ b/src/mem/ruby/structures/TBETable.hh @@ -70,6 +70,8 @@ class TBETable return (m_number_of_TBEs - m_map.size()) >= n; } + void setBlockSize(const int block_size) { m_block_size = block_size; } + ENTRY *getNullEntry(); ENTRY *lookup(Addr address); @@ -85,7 +87,8 @@ class TBETable std::unordered_map m_map; private: - int m_number_of_TBEs; + int m_number_of_TBEs = 0; + int m_block_size = 0; }; template @@ -101,7 +104,7 @@ template inline bool TBETable::isPresent(Addr address) const { - assert(address == makeLineAddress(address)); + assert(address == makeLineAddress(address, floorLog2(m_block_size))); assert(m_map.size() <= m_number_of_TBEs); return !!m_map.count(address); } @@ -112,7 +115,8 @@ TBETable::allocate(Addr address) { assert(!isPresent(address)); assert(m_map.size() < m_number_of_TBEs); - m_map[address] = ENTRY(); + assert(m_block_size > 0); + m_map.emplace(address, ENTRY(m_block_size)); } template diff --git a/src/mem/ruby/structures/TimerTable.cc b/src/mem/ruby/structures/TimerTable.cc index f8f24dbfc0..a9ce92252e 100644 --- a/src/mem/ruby/structures/TimerTable.cc +++ b/src/mem/ruby/structures/TimerTable.cc @@ -70,7 +70,7 @@ TimerTable::nextAddress() const void TimerTable::set(Addr address, Tick ready_time) { - assert(address == makeLineAddress(address)); + assert(address == makeLineAddress(address, m_block_size_bits)); 
assert(!m_map.count(address)); m_map[address] = ready_time; @@ -87,7 +87,7 @@ TimerTable::set(Addr address, Tick ready_time) void TimerTable::unset(Addr address) { - assert(address == makeLineAddress(address)); + assert(address == makeLineAddress(address, m_block_size_bits)); assert(m_map.count(address)); m_map.erase(address); diff --git a/src/mem/ruby/structures/TimerTable.hh b/src/mem/ruby/structures/TimerTable.hh index e676359fd4..92c485ab57 100644 --- a/src/mem/ruby/structures/TimerTable.hh +++ b/src/mem/ruby/structures/TimerTable.hh @@ -48,6 +48,12 @@ class TimerTable public: TimerTable(); + void + setBlockSize(int block_size) + { + m_block_size_bits = floorLog2(block_size); + } + void setConsumer(Consumer* consumer_ptr) { @@ -88,6 +94,8 @@ class TimerTable //! Consumer to signal a wakeup() Consumer* m_consumer_ptr; + int m_block_size_bits = 0; + std::string m_name; }; diff --git a/src/mem/ruby/structures/WireBuffer.cc b/src/mem/ruby/structures/WireBuffer.cc index a839fe7cc7..3ebbe2a305 100644 --- a/src/mem/ruby/structures/WireBuffer.cc +++ b/src/mem/ruby/structures/WireBuffer.cc @@ -36,7 +36,6 @@ #include "base/cprintf.hh" #include "base/stl_helpers.hh" -#include "mem/ruby/system/RubySystem.hh" namespace gem5 { @@ -74,7 +73,8 @@ WireBuffer::~WireBuffer() } void -WireBuffer::enqueue(MsgPtr message, Tick current_time, Tick delta) +WireBuffer::enqueue(MsgPtr message, Tick current_time, Tick delta, + bool /*ruby_is_random*/, bool /*ruby_warmup*/) { m_msg_counter++; Tick arrival_time = current_time + delta; diff --git a/src/mem/ruby/structures/WireBuffer.hh b/src/mem/ruby/structures/WireBuffer.hh index b26043b09a..75dfc154c8 100644 --- a/src/mem/ruby/structures/WireBuffer.hh +++ b/src/mem/ruby/structures/WireBuffer.hh @@ -78,7 +78,10 @@ class WireBuffer : public SimObject void setDescription(const std::string& name) { m_description = name; }; std::string getDescription() { return m_description; }; - void enqueue(MsgPtr message, Tick current_time, Tick delta); + // 
ruby_is_random and ruby_warmup are not used, but this method signature + // must match that of MessageBuffer. + void enqueue(MsgPtr message, Tick current_time, Tick delta, + bool ruby_is_random = false, bool ruby_warmup = false); void dequeue(Tick current_time); const Message* peek(); void recycle(Tick current_time, Tick recycle_latency); diff --git a/src/mem/ruby/structures/WireBuffer.py b/src/mem/ruby/structures/WireBuffer.py index ca67e7cb31..8cb2cfe4d6 100644 --- a/src/mem/ruby/structures/WireBuffer.py +++ b/src/mem/ruby/structures/WireBuffer.py @@ -35,5 +35,3 @@ class RubyWireBuffer(SimObject): type = "RubyWireBuffer" cxx_class = "gem5::ruby::WireBuffer" cxx_header = "mem/ruby/structures/WireBuffer.hh" - - ruby_system = Param.RubySystem(Parent.any, "") diff --git a/src/mem/ruby/system/CacheRecorder.cc b/src/mem/ruby/system/CacheRecorder.cc index 3326856849..426c604cb0 100644 --- a/src/mem/ruby/system/CacheRecorder.cc +++ b/src/mem/ruby/system/CacheRecorder.cc @@ -49,31 +49,25 @@ TraceRecord::print(std::ostream& out) const << m_type << ", Time: " << m_time << "]"; } -CacheRecorder::CacheRecorder() - : m_uncompressed_trace(NULL), - m_uncompressed_trace_size(0), - m_block_size_bytes(RubySystem::getBlockSizeBytes()) -{ -} - CacheRecorder::CacheRecorder(uint8_t* uncompressed_trace, uint64_t uncompressed_trace_size, std::vector& ruby_port_map, - uint64_t block_size_bytes) + uint64_t trace_block_size_bytes, + uint64_t system_block_size_bytes) : m_uncompressed_trace(uncompressed_trace), m_uncompressed_trace_size(uncompressed_trace_size), m_ruby_port_map(ruby_port_map), m_bytes_read(0), m_records_read(0), m_records_flushed(0), - m_block_size_bytes(block_size_bytes) + m_block_size_bytes(trace_block_size_bytes) { if (m_uncompressed_trace != NULL) { - if (m_block_size_bytes < RubySystem::getBlockSizeBytes()) { + if (m_block_size_bytes < system_block_size_bytes) { // Block sizes larger than when the trace was recorded are not // supported, as we cannot reliably turn 
accesses to smaller blocks // into larger ones. panic("Recorded cache block size (%d) < current block size (%d) !!", - m_block_size_bytes, RubySystem::getBlockSizeBytes()); + m_block_size_bytes, system_block_size_bytes); } } } @@ -125,7 +119,7 @@ CacheRecorder::enqueueNextFetchRequest() DPRINTF(RubyCacheTrace, "Issuing %s\n", *traceRecord); for (int rec_bytes_read = 0; rec_bytes_read < m_block_size_bytes; - rec_bytes_read += RubySystem::getBlockSizeBytes()) { + rec_bytes_read += m_block_size_bytes) { RequestPtr req; MemCmd::Command requestType; @@ -133,19 +127,19 @@ CacheRecorder::enqueueNextFetchRequest() requestType = MemCmd::ReadReq; req = std::make_shared( traceRecord->m_data_address + rec_bytes_read, - RubySystem::getBlockSizeBytes(), 0, + m_block_size_bytes, 0, Request::funcRequestorId); } else if (traceRecord->m_type == RubyRequestType_IFETCH) { requestType = MemCmd::ReadReq; req = std::make_shared( traceRecord->m_data_address + rec_bytes_read, - RubySystem::getBlockSizeBytes(), + m_block_size_bytes, Request::INST_FETCH, Request::funcRequestorId); } else { requestType = MemCmd::WriteReq; req = std::make_shared( traceRecord->m_data_address + rec_bytes_read, - RubySystem::getBlockSizeBytes(), 0, + m_block_size_bytes, 0, Request::funcRequestorId); } diff --git a/src/mem/ruby/system/CacheRecorder.hh b/src/mem/ruby/system/CacheRecorder.hh index 021da6a4da..982e8b0592 100644 --- a/src/mem/ruby/system/CacheRecorder.hh +++ b/src/mem/ruby/system/CacheRecorder.hh @@ -73,13 +73,15 @@ class TraceRecord class CacheRecorder { public: - CacheRecorder(); - ~CacheRecorder(); - + // Construction requires block size. 
+ CacheRecorder() = delete; CacheRecorder(uint8_t* uncompressed_trace, uint64_t uncompressed_trace_size, std::vector& ruby_port_map, - uint64_t block_size_bytes); + uint64_t trace_block_size_bytes, + uint64_t system_block_size_bytes); + ~CacheRecorder(); + void addRecord(int cntrl, Addr data_addr, Addr pc_addr, RubyRequestType type, Tick time, DataBlock& data); diff --git a/src/mem/ruby/system/DMASequencer.cc b/src/mem/ruby/system/DMASequencer.cc index aa3fc66814..cd9d62d12a 100644 --- a/src/mem/ruby/system/DMASequencer.cc +++ b/src/mem/ruby/system/DMASequencer.cc @@ -73,7 +73,7 @@ void DMASequencer::init() { RubyPort::init(); - m_data_block_mask = mask(RubySystem::getBlockSizeBits()); + m_data_block_mask = mask(m_ruby_system->getBlockSizeBits()); } RequestStatus @@ -110,8 +110,10 @@ DMASequencer::makeRequest(PacketPtr pkt) DPRINTF(RubyDma, "DMA req created: addr %p, len %d\n", line_addr, len); + int blk_size = m_ruby_system->getBlockSizeBytes(); + std::shared_ptr msg = - std::make_shared(clockEdge()); + std::make_shared(clockEdge(), blk_size, m_ruby_system); msg->getPhysicalAddress() = paddr; msg->getLineAddress() = line_addr; @@ -145,8 +147,8 @@ DMASequencer::makeRequest(PacketPtr pkt) int offset = paddr & m_data_block_mask; - msg->getLen() = (offset + len) <= RubySystem::getBlockSizeBytes() ? - len : RubySystem::getBlockSizeBytes() - offset; + msg->getLen() = (offset + len) <= m_ruby_system->getBlockSizeBytes() ? 
+ len : m_ruby_system->getBlockSizeBytes() - offset; if (write && (data != NULL)) { if (active_request.data != NULL) { @@ -157,7 +159,8 @@ DMASequencer::makeRequest(PacketPtr pkt) m_outstanding_count++; assert(m_mandatory_q_ptr != NULL); - m_mandatory_q_ptr->enqueue(msg, clockEdge(), cyclesToTicks(Cycles(1))); + m_mandatory_q_ptr->enqueue(msg, clockEdge(), cyclesToTicks(Cycles(1)), + m_ruby_system->getRandomization(), m_ruby_system->getWarmupEnabled()); active_request.bytes_issued += msg->getLen(); return RequestStatus_Issued; @@ -183,8 +186,10 @@ DMASequencer::issueNext(const Addr& address) return; } + int blk_size = m_ruby_system->getBlockSizeBytes(); + std::shared_ptr msg = - std::make_shared(clockEdge()); + std::make_shared(clockEdge(), blk_size, m_ruby_system); msg->getPhysicalAddress() = active_request.start_paddr + active_request.bytes_completed; @@ -196,9 +201,9 @@ DMASequencer::issueNext(const Addr& address) msg->getLen() = (active_request.len - - active_request.bytes_completed < RubySystem::getBlockSizeBytes() ? + active_request.bytes_completed < m_ruby_system->getBlockSizeBytes() ? active_request.len - active_request.bytes_completed : - RubySystem::getBlockSizeBytes()); + m_ruby_system->getBlockSizeBytes()); if (active_request.write) { msg->getDataBlk(). 
@@ -207,7 +212,8 @@ DMASequencer::issueNext(const Addr& address) } assert(m_mandatory_q_ptr != NULL); - m_mandatory_q_ptr->enqueue(msg, clockEdge(), cyclesToTicks(Cycles(1))); + m_mandatory_q_ptr->enqueue(msg, clockEdge(), cyclesToTicks(Cycles(1)), + m_ruby_system->getRandomization(), m_ruby_system->getWarmupEnabled()); active_request.bytes_issued += msg->getLen(); DPRINTF(RubyDma, "DMA request bytes issued %d, bytes completed %d, total len %d\n", diff --git a/src/mem/ruby/system/GPUCoalescer.cc b/src/mem/ruby/system/GPUCoalescer.cc index 072c63efd7..4d66dc6c1b 100644 --- a/src/mem/ruby/system/GPUCoalescer.cc +++ b/src/mem/ruby/system/GPUCoalescer.cc @@ -142,8 +142,8 @@ UncoalescedTable::updateResources() // are accessed directly using the makeRequest() command // instead of accessing through the port. This makes // sending tokens through the port unnecessary - if (!RubySystem::getWarmupEnabled() - && !RubySystem::getCooldownEnabled()) { + if (!coalescer->getRubySystem()->getWarmupEnabled() && + !coalescer->getRubySystem()->getCooldownEnabled()) { if (reqTypeMap[seq_num] != RubyRequestType_FLUSH) { DPRINTF(GPUCoalescer, "Returning token seqNum %d\n", seq_num); @@ -177,7 +177,7 @@ UncoalescedTable::printRequestTable(std::stringstream& ss) ss << "Listing pending packets from " << instMap.size() << " instructions"; for (auto& inst : instMap) { - ss << "\tAddr: " << printAddress(inst.first) << " with " + ss << "\tAddr: " << coalescer->printAddress(inst.first) << " with " << inst.second.size() << " pending packets" << std::endl; } } @@ -590,7 +590,7 @@ GPUCoalescer::hitCallback(CoalescedRequest* crequest, // When the Ruby system is cooldown phase, the requests come from // the cache recorder. These requests do not get coalesced and // do not return valid data. 
- if (RubySystem::getCooldownEnabled()) + if (m_ruby_system->getCooldownEnabled()) continue; if (pkt->getPtr()) { @@ -700,8 +700,8 @@ GPUCoalescer::makeRequest(PacketPtr pkt) // When Ruby is in warmup or cooldown phase, the requests come from // the cache recorder. There is no dynamic instruction associated // with these requests either - if (!RubySystem::getWarmupEnabled() - && !RubySystem::getCooldownEnabled()) { + if (!m_ruby_system->getWarmupEnabled() + && !m_ruby_system->getCooldownEnabled()) { if (!m_usingRubyTester) { num_packets = 0; for (int i = 0; i < TheGpuISA::NumVecElemPerVecReg; i++) { @@ -985,8 +985,8 @@ GPUCoalescer::completeHitCallback(std::vector & mylist) // When Ruby is in warmup or cooldown phase, the requests come // from the cache recorder. They do not track which port to use // and do not need to send the response back - if (!RubySystem::getWarmupEnabled() - && !RubySystem::getCooldownEnabled()) { + if (!m_ruby_system->getWarmupEnabled() + && !m_ruby_system->getCooldownEnabled()) { RubyPort::SenderState *ss = safe_cast(pkt->senderState); MemResponsePort *port = ss->port; @@ -1015,9 +1015,9 @@ GPUCoalescer::completeHitCallback(std::vector & mylist) } RubySystem *rs = m_ruby_system; - if (RubySystem::getWarmupEnabled()) { + if (m_ruby_system->getWarmupEnabled()) { rs->m_cache_recorder->enqueueNextFetchRequest(); - } else if (RubySystem::getCooldownEnabled()) { + } else if (m_ruby_system->getCooldownEnabled()) { rs->m_cache_recorder->enqueueNextFlushRequest(); } else { testDrainComplete(); diff --git a/src/mem/ruby/system/GPUCoalescer.hh b/src/mem/ruby/system/GPUCoalescer.hh index 42efe41cb7..08412baad1 100644 --- a/src/mem/ruby/system/GPUCoalescer.hh +++ b/src/mem/ruby/system/GPUCoalescer.hh @@ -341,6 +341,8 @@ class GPUCoalescer : public RubyPort void insertKernel(int wavefront_id, PacketPtr pkt); + RubySystem *getRubySystem() { return m_ruby_system; } + GMTokenPort& getGMTokenPort() { return gmTokenPort; } statistics::Histogram& 
getOutstandReqHist() { return m_outstandReqHist; } diff --git a/src/mem/ruby/system/RubyPort.cc b/src/mem/ruby/system/RubyPort.cc index 2630a6a27c..127f3c7802 100644 --- a/src/mem/ruby/system/RubyPort.cc +++ b/src/mem/ruby/system/RubyPort.cc @@ -326,6 +326,8 @@ RubyPort::MemResponsePort::recvAtomic(PacketPtr pkt) panic("Ruby supports atomic accesses only in noncaching mode\n"); } + RubySystem *rs = owner.m_ruby_system; + // Check for pio requests and directly send them to the dedicated // pio port. if (pkt->cmd != MemCmd::MemSyncReq) { @@ -343,12 +345,11 @@ RubyPort::MemResponsePort::recvAtomic(PacketPtr pkt) return owner.ticksToCycles(req_ticks); } - assert(getOffset(pkt->getAddr()) + pkt->getSize() <= - RubySystem::getBlockSizeBytes()); + assert(owner.getOffset(pkt->getAddr()) + pkt->getSize() <= + rs->getBlockSizeBytes()); } // Find the machine type of memory controller interface - RubySystem *rs = owner.m_ruby_system; static int mem_interface_type = -1; if (mem_interface_type == -1) { if (rs->m_abstract_controls[MachineType_Directory].size() != 0) { @@ -404,7 +405,7 @@ RubyPort::MemResponsePort::recvFunctional(PacketPtr pkt) } assert(pkt->getAddr() + pkt->getSize() <= - makeLineAddress(pkt->getAddr()) + RubySystem::getBlockSizeBytes()); + owner.makeLineAddress(pkt->getAddr()) + rs->getBlockSizeBytes()); if (access_backing_store) { // The attached physmem contains the official version of data. @@ -501,7 +502,7 @@ RubyPort::ruby_stale_translation_callback(Addr txnId) // assumed they will not be modified or deleted by receivers. // TODO: should this really be using funcRequestorId? 
auto request = std::make_shared( - 0, RubySystem::getBlockSizeBytes(), Request::TLBI_EXT_SYNC, + 0, m_ruby_system->getBlockSizeBytes(), Request::TLBI_EXT_SYNC, Request::funcRequestorId); // Store the txnId in extraData instead of the address request->setExtraData(txnId); @@ -701,7 +702,7 @@ RubyPort::ruby_eviction_callback(Addr address) // assumed they will not be modified or deleted by receivers. // TODO: should this really be using funcRequestorId? auto request = std::make_shared( - address, RubySystem::getBlockSizeBytes(), 0, + address, m_ruby_system->getBlockSizeBytes(), 0, Request::funcRequestorId); // Use a single packet to signal all snooping ports of the invalidation. @@ -739,5 +740,23 @@ RubyPort::functionalWrite(Packet *func_pkt) return num_written; } +Addr +RubyPort::getOffset(Addr addr) const +{ + return ruby::getOffset(addr, m_ruby_system->getBlockSizeBits()); +} + +Addr +RubyPort::makeLineAddress(Addr addr) const +{ + return ruby::makeLineAddress(addr, m_ruby_system->getBlockSizeBits()); +} + +std::string +RubyPort::printAddress(Addr addr) const +{ + return ruby::printAddress(addr, m_ruby_system->getBlockSizeBits()); +} + } // namespace ruby } // namespace gem5 diff --git a/src/mem/ruby/system/RubyPort.hh b/src/mem/ruby/system/RubyPort.hh index 66fe0a7686..39535930b3 100644 --- a/src/mem/ruby/system/RubyPort.hh +++ b/src/mem/ruby/system/RubyPort.hh @@ -181,6 +181,11 @@ class RubyPort : public ClockedObject virtual int functionalWrite(Packet *func_pkt); + // Helper methods wrapping commonly used functions declared in common/address.hh + Addr getOffset(Addr addr) const; + Addr makeLineAddress(Addr addr) const; + std::string printAddress(Addr addr) const; + protected: void trySendRetries(); void ruby_hit_callback(PacketPtr pkt); diff --git a/src/mem/ruby/system/RubySystem.cc b/src/mem/ruby/system/RubySystem.cc index 21062eac14..fd7b262cb1 100644 --- a/src/mem/ruby/system/RubySystem.cc +++ b/src/mem/ruby/system/RubySystem.cc @@ -66,15 +66,8 @@ namespace gem5
namespace ruby { -bool RubySystem::m_randomization; -uint32_t RubySystem::m_block_size_bytes; -uint32_t RubySystem::m_block_size_bits; -uint32_t RubySystem::m_memory_size_bits; -bool RubySystem::m_warmup_enabled = false; // To look forward to allowing multiple RubySystem instances, track the number // of RubySystems that need to be warmed up on checkpoint restore. -unsigned RubySystem::m_systems_to_warmup = 0; -bool RubySystem::m_cooldown_enabled = false; RubySystem::RubySystem(const Params &p) : ClockedObject(p), m_access_backing_store(p.access_backing_store), @@ -212,8 +205,8 @@ RubySystem::makeCacheRecorder(uint8_t *uncompressed_trace, // Create the CacheRecorder and record the cache trace m_cache_recorder = new CacheRecorder(uncompressed_trace, cache_trace_size, - ruby_port_map, - block_size_bytes); + ruby_port_map, block_size_bytes, + m_block_size_bytes); } void @@ -331,7 +324,7 @@ RubySystem::serialize(CheckpointOut &cp) const // Store the cache-block size, so we are able to restore on systems // with a different cache-block size. CacheRecorder depends on the // correct cache-block size upon unserializing. - uint64_t block_size_bytes = getBlockSizeBytes(); + uint64_t block_size_bytes = m_block_size_bytes; SERIALIZE_SCALAR(block_size_bytes); // Check that there's a valid trace to use. If not, then memory won't @@ -416,7 +409,6 @@ RubySystem::unserialize(CheckpointIn &cp) readCompressedTrace(cache_trace_file, uncompressed_trace, cache_trace_size); m_warmup_enabled = true; - m_systems_to_warmup++; // Create the cache recorder that will hang around until startup. 
makeCacheRecorder(uncompressed_trace, cache_trace_size, block_size_bytes); @@ -467,10 +459,7 @@ RubySystem::startup() delete m_cache_recorder; m_cache_recorder = NULL; - m_systems_to_warmup--; - if (m_systems_to_warmup == 0) { - m_warmup_enabled = false; - } + m_warmup_enabled = false; // Restore eventq head eventq->replaceHead(eventq_head); @@ -509,7 +498,7 @@ bool RubySystem::functionalRead(PacketPtr pkt) { Addr address(pkt->getAddr()); - Addr line_address = makeLineAddress(address); + Addr line_address = makeLineAddress(address, m_block_size_bits); AccessPermission access_perm = AccessPermission_NotPresent; @@ -625,7 +614,7 @@ bool RubySystem::functionalRead(PacketPtr pkt) { Addr address(pkt->getAddr()); - Addr line_address = makeLineAddress(address); + Addr line_address = makeLineAddress(address, m_block_size_bits); DPRINTF(RubySystem, "Functional Read request for %#x\n", address); @@ -726,7 +715,7 @@ bool RubySystem::functionalWrite(PacketPtr pkt) { Addr addr(pkt->getAddr()); - Addr line_addr = makeLineAddress(addr); + Addr line_addr = makeLineAddress(addr, m_block_size_bits); AccessPermission access_perm = AccessPermission_NotPresent; DPRINTF(RubySystem, "Functional Write request for %#x\n", addr); diff --git a/src/mem/ruby/system/RubySystem.hh b/src/mem/ruby/system/RubySystem.hh index e16d699204..7e18770230 100644 --- a/src/mem/ruby/system/RubySystem.hh +++ b/src/mem/ruby/system/RubySystem.hh @@ -68,12 +68,12 @@ class RubySystem : public ClockedObject ~RubySystem(); // config accessors - static int getRandomization() { return m_randomization; } - static uint32_t getBlockSizeBytes() { return m_block_size_bytes; } - static uint32_t getBlockSizeBits() { return m_block_size_bits; } - static uint32_t getMemorySizeBits() { return m_memory_size_bits; } - static bool getWarmupEnabled() { return m_warmup_enabled; } - static bool getCooldownEnabled() { return m_cooldown_enabled; } + int getRandomization() { return m_randomization; } + uint32_t getBlockSizeBytes() { 
return m_block_size_bytes; } + uint32_t getBlockSizeBits() { return m_block_size_bits; } + uint32_t getMemorySizeBits() { return m_memory_size_bits; } + bool getWarmupEnabled() { return m_warmup_enabled; } + bool getCooldownEnabled() { return m_cooldown_enabled; } memory::SimpleMemory *getPhysMem() { return m_phys_mem; } Cycles getStartCycle() { return m_start_cycle; } @@ -134,14 +134,13 @@ class RubySystem : public ClockedObject void processRubyEvent(); private: // configuration parameters - static bool m_randomization; - static uint32_t m_block_size_bytes; - static uint32_t m_block_size_bits; - static uint32_t m_memory_size_bits; + bool m_randomization; + uint32_t m_block_size_bytes; + uint32_t m_block_size_bits; + uint32_t m_memory_size_bits; - static bool m_warmup_enabled; - static unsigned m_systems_to_warmup; - static bool m_cooldown_enabled; + bool m_warmup_enabled = false; + bool m_cooldown_enabled = false; memory::SimpleMemory *m_phys_mem; const bool m_access_backing_store; @@ -158,6 +157,11 @@ class RubySystem : public ClockedObject Profiler* m_profiler; CacheRecorder* m_cache_recorder; std::vector > m_abstract_controls; + std::map m_num_controllers; + + // These are auto-generated by SLICC based on the built protocol. 
+ int MachineType_base_count(const MachineType& obj); + int MachineType_base_number(const MachineType& obj); }; } // namespace ruby diff --git a/src/mem/ruby/system/Sequencer.cc b/src/mem/ruby/system/Sequencer.cc index 4b0c6a239c..e2f49f5dff 100644 --- a/src/mem/ruby/system/Sequencer.cc +++ b/src/mem/ruby/system/Sequencer.cc @@ -73,6 +73,8 @@ Sequencer::Sequencer(const Params &p) { m_outstanding_count = 0; + m_ruby_system = p.ruby_system; + m_dataCache_ptr = p.dcache; m_max_outstanding_requests = p.max_outstanding_requests; m_deadlock_threshold = p.deadlock_threshold; @@ -726,7 +728,7 @@ Sequencer::hitCallback(SequencerRequest* srequest, DataBlock& data, printAddress(request_address)); // update the data unless it is a non-data-carrying flush - if (RubySystem::getWarmupEnabled()) { + if (m_ruby_system->getWarmupEnabled()) { data.setData(pkt); } else if (!pkt->isFlush()) { if ((type == RubyRequestType_LD) || @@ -782,11 +784,11 @@ Sequencer::hitCallback(SequencerRequest* srequest, DataBlock& data, } RubySystem *rs = m_ruby_system; - if (RubySystem::getWarmupEnabled()) { + if (m_ruby_system->getWarmupEnabled()) { assert(pkt->req); delete pkt; rs->m_cache_recorder->enqueueNextFetchRequest(); - } else if (RubySystem::getCooldownEnabled()) { + } else if (m_ruby_system->getCooldownEnabled()) { delete pkt; rs->m_cache_recorder->enqueueNextFlushRequest(); } else { @@ -852,8 +854,8 @@ Sequencer::completeHitCallback(std::vector & mylist) // When Ruby is in warmup or cooldown phase, the requests come // from the cache recorder. 
They do not track which port to use // and do not need to send the response back - if (!RubySystem::getWarmupEnabled() - && !RubySystem::getCooldownEnabled()) { + if (!m_ruby_system->getWarmupEnabled() + && !m_ruby_system->getCooldownEnabled()) { RubyPort::SenderState *ss = safe_cast(pkt->senderState); MemResponsePort *port = ss->port; @@ -873,9 +875,9 @@ Sequencer::completeHitCallback(std::vector & mylist) } RubySystem *rs = m_ruby_system; - if (RubySystem::getWarmupEnabled()) { + if (m_ruby_system->getWarmupEnabled()) { rs->m_cache_recorder->enqueueNextFetchRequest(); - } else if (RubySystem::getCooldownEnabled()) { + } else if (m_ruby_system->getCooldownEnabled()) { rs->m_cache_recorder->enqueueNextFlushRequest(); } else { testDrainComplete(); @@ -910,14 +912,16 @@ Sequencer::invL1() // Evict Read-only data RubyRequestType request_type = RubyRequestType_REPLACEMENT; std::shared_ptr msg = std::make_shared( - clockEdge(), addr, 0, 0, - request_type, RubyAccessMode_Supervisor, + clockEdge(), m_ruby_system->getBlockSizeBytes(), m_ruby_system, + addr, 0, 0, request_type, RubyAccessMode_Supervisor, nullptr); DPRINTF(RubySequencer, "Evicting addr 0x%x\n", addr); assert(m_mandatory_q_ptr != NULL); Tick latency = cyclesToTicks( m_controller->mandatoryQueueLatency(request_type)); - m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency); + m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency, + m_ruby_system->getRandomization(), + m_ruby_system->getWarmupEnabled()); m_num_pending_invs++; } DPRINTF(RubySequencer, @@ -1080,11 +1084,14 @@ Sequencer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type) pc = pkt->req->getPC(); } + int blk_size = m_ruby_system->getBlockSizeBytes(); + // check if the packet has data as for example prefetch and flush // requests do not std::shared_ptr msg; if (pkt->req->isMemMgmt()) { - msg = std::make_shared(clockEdge(), + msg = std::make_shared(clockEdge(), blk_size, + m_ruby_system, pc, secondary_type, RubyAccessMode_Supervisor, pkt, 
proc_id, core_id); @@ -1111,8 +1118,10 @@ Sequencer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type) msg->m_tlbiTransactionUid); } } else { - msg = std::make_shared(clockEdge(), pkt->getAddr(), - pkt->getSize(), pc, secondary_type, + msg = std::make_shared(clockEdge(), blk_size, + m_ruby_system, + pkt->getAddr(), pkt->getSize(), + pc, secondary_type, RubyAccessMode_Supervisor, pkt, PrefetchBit_No, proc_id, core_id); @@ -1147,7 +1156,9 @@ Sequencer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type) assert(latency > 0); assert(m_mandatory_q_ptr != NULL); - m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency); + m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency, + m_ruby_system->getRandomization(), + m_ruby_system->getWarmupEnabled()); } template @@ -1194,7 +1205,7 @@ Sequencer::incrementUnaddressedTransactionCnt() // Limit m_unaddressedTransactionCnt to 32 bits, // top 32 bits should always be zeroed out uint64_t aligned_txid = \ - m_unaddressedTransactionCnt << RubySystem::getBlockSizeBits(); + m_unaddressedTransactionCnt << m_ruby_system->getBlockSizeBits(); if (aligned_txid > 0xFFFFFFFFull) { m_unaddressedTransactionCnt = 0; @@ -1206,7 +1217,7 @@ Sequencer::getCurrentUnaddressedTransactionID() const { return ( uint64_t(m_version & 0xFFFFFFFF) << 32) | - (m_unaddressedTransactionCnt << RubySystem::getBlockSizeBits() + (m_unaddressedTransactionCnt << m_ruby_system->getBlockSizeBits() ); } diff --git a/src/mem/ruby/system/Sequencer.hh b/src/mem/ruby/system/Sequencer.hh index 1f60d2638f..ee16d2fe2e 100644 --- a/src/mem/ruby/system/Sequencer.hh +++ b/src/mem/ruby/system/Sequencer.hh @@ -254,6 +254,8 @@ class Sequencer : public RubyPort RubyRequestType primary_type, RubyRequestType secondary_type); + RubySystem *m_ruby_system; + private: int m_max_outstanding_requests; diff --git a/src/mem/ruby/system/Sequencer.py b/src/mem/ruby/system/Sequencer.py index 3f570fb952..0994bb4afe 100644 --- a/src/mem/ruby/system/Sequencer.py +++ 
b/src/mem/ruby/system/Sequencer.py @@ -83,7 +83,7 @@ class RubyPort(ClockedObject): using_ruby_tester = Param.Bool(False, "") no_retry_on_stall = Param.Bool(False, "") - ruby_system = Param.RubySystem(Parent.any, "") + ruby_system = Param.RubySystem("Parent RubySystem object") system = Param.System(Parent.any, "system object") support_data_reqs = Param.Bool(True, "data cache requests supported") support_inst_reqs = Param.Bool(True, "inst cache requests supported") diff --git a/src/mem/ruby/system/VIPERCoalescer.cc b/src/mem/ruby/system/VIPERCoalescer.cc index 47ceced3a7..67dd88fb2e 100644 --- a/src/mem/ruby/system/VIPERCoalescer.cc +++ b/src/mem/ruby/system/VIPERCoalescer.cc @@ -135,9 +135,9 @@ VIPERCoalescer::issueRequest(CoalescedRequest* crequest) // Creating WriteMask that records written bytes // and atomic operations. This enables partial writes // and partial reads of those writes - DataBlock dataBlock; + uint32_t blockSize = m_ruby_system->getBlockSizeBytes(); + DataBlock dataBlock(blockSize); dataBlock.clear(); - uint32_t blockSize = RubySystem::getBlockSizeBytes(); std::vector accessMask(blockSize,false); std::vector< std::pair > atomicOps; uint32_t tableSize = crequest->getPackets().size(); @@ -159,15 +159,17 @@ VIPERCoalescer::issueRequest(CoalescedRequest* crequest) } std::shared_ptr msg; if (pkt->isAtomicOp()) { - msg = std::make_shared(clockEdge(), pkt->getAddr(), - pkt->getSize(), pc, crequest->getRubyType(), + msg = std::make_shared(clockEdge(), blockSize, + m_ruby_system, pkt->getAddr(), pkt->getSize(), + pc, crequest->getRubyType(), RubyAccessMode_Supervisor, pkt, PrefetchBit_No, proc_id, 100, blockSize, accessMask, dataBlock, atomicOps, crequest->getSeqNum()); } else { - msg = std::make_shared(clockEdge(), pkt->getAddr(), - pkt->getSize(), pc, crequest->getRubyType(), + msg = std::make_shared(clockEdge(), blockSize, + m_ruby_system, pkt->getAddr(), pkt->getSize(), + pc, crequest->getRubyType(), RubyAccessMode_Supervisor, pkt, PrefetchBit_No, 
proc_id, 100, blockSize, accessMask, @@ -195,7 +197,9 @@ VIPERCoalescer::issueRequest(CoalescedRequest* crequest) assert(m_mandatory_q_ptr); Tick latency = cyclesToTicks( m_controller->mandatoryQueueLatency(crequest->getRubyType())); - m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency); + m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency, + m_ruby_system->getRandomization(), + m_ruby_system->getWarmupEnabled()); } void @@ -241,7 +245,7 @@ VIPERCoalescer::writeCompleteCallback(Addr addr, uint64_t instSeqNum) std::remove_if( m_writeCompletePktMap[key].begin(), m_writeCompletePktMap[key].end(), - [addr](PacketPtr writeCompletePkt) -> bool { + [this,addr](PacketPtr writeCompletePkt) -> bool { if (makeLineAddress(writeCompletePkt->getAddr()) == addr) { RubyPort::SenderState *ss = safe_cast @@ -296,14 +300,15 @@ VIPERCoalescer::invTCP() // Evict Read-only data RubyRequestType request_type = RubyRequestType_REPLACEMENT; std::shared_ptr msg = std::make_shared( - clockEdge(), addr, 0, 0, - request_type, RubyAccessMode_Supervisor, - nullptr); + clockEdge(), m_ruby_system->getBlockSizeBytes(), m_ruby_system, + addr, 0, 0, request_type, RubyAccessMode_Supervisor, nullptr); DPRINTF(GPUCoalescer, "Evicting addr 0x%x\n", addr); assert(m_mandatory_q_ptr != NULL); Tick latency = cyclesToTicks( m_controller->mandatoryQueueLatency(request_type)); - m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency); + m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency, + m_ruby_system->getRandomization(), + m_ruby_system->getWarmupEnabled()); m_num_pending_invs++; } DPRINTF(GPUCoalescer, @@ -343,16 +348,17 @@ VIPERCoalescer::invTCC(PacketPtr pkt) RubyRequestType request_type = RubyRequestType_InvL2; std::shared_ptr msg = std::make_shared( - clockEdge(), addr, 0, 0, - request_type, RubyAccessMode_Supervisor, - nullptr); + clockEdge(), m_ruby_system->getBlockSizeBytes(), m_ruby_system, + addr, 0, 0, request_type, RubyAccessMode_Supervisor, nullptr); DPRINTF(GPUCoalescer, "Sending L2 
invalidate to 0x%x\n", addr); assert(m_mandatory_q_ptr); Tick latency = cyclesToTicks( m_controller->mandatoryQueueLatency(request_type)); - m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency); + m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency, + m_ruby_system->getRandomization(), + m_ruby_system->getWarmupEnabled()); m_pending_invl2s[addr].push_back(pkt); } diff --git a/src/mem/ruby/system/VIPERSequencer.cc b/src/mem/ruby/system/VIPERSequencer.cc index ac840777d4..b8b806aa9c 100644 --- a/src/mem/ruby/system/VIPERSequencer.cc +++ b/src/mem/ruby/system/VIPERSequencer.cc @@ -81,8 +81,8 @@ VIPERSequencer::hitCallback(SequencerRequest* srequest, DataBlock& data, // subBlock with the recieved data. The tester will later access // this state. assert(!m_usingRubyTester); - assert(!RubySystem::getWarmupEnabled()); - assert(!RubySystem::getCooldownEnabled()); + assert(!m_ruby_system->getWarmupEnabled()); + assert(!m_ruby_system->getCooldownEnabled()); ruby_hit_callback(pkt); testDrainComplete(); } diff --git a/src/mem/slicc/ast/CheckProbeStatementAST.py b/src/mem/slicc/ast/CheckProbeStatementAST.py index 10945cfc30..14f6f7e4fa 100644 --- a/src/mem/slicc/ast/CheckProbeStatementAST.py +++ b/src/mem/slicc/ast/CheckProbeStatementAST.py @@ -49,7 +49,8 @@ class CheckProbeStatementAST(StatementAST): if (m_is_blocking && (m_block_map.count($address_code) == 1) && (m_block_map[$address_code] == &$in_port_code)) { - $in_port_code.delayHead(clockEdge(), cyclesToTicks(Cycles(1))); + $in_port_code.delayHead(clockEdge(), cyclesToTicks(Cycles(1)), + m_ruby_system->getRandomization(), m_ruby_system->getWarmupEnabled()); continue; } """ diff --git a/src/mem/slicc/ast/DeferEnqueueingStatementAST.py b/src/mem/slicc/ast/DeferEnqueueingStatementAST.py index 14b2e48cd3..4bb446aee2 100644 --- a/src/mem/slicc/ast/DeferEnqueueingStatementAST.py +++ b/src/mem/slicc/ast/DeferEnqueueingStatementAST.py @@ -68,7 +68,8 @@ class DeferEnqueueingStatementAST(StatementAST): # Declare message code( 
"std::shared_ptr<${{msg_type.c_ident}}> out_msg = " - "std::make_shared<${{msg_type.c_ident}}>(clockEdge());" + "std::make_shared<${{msg_type.c_ident}}>(clockEdge()," + " m_ruby_system->getBlockSizeBytes(), m_ruby_system);" ) # The other statements diff --git a/src/mem/slicc/ast/EnqueueStatementAST.py b/src/mem/slicc/ast/EnqueueStatementAST.py index c2d47af9ce..b026f6e7a9 100644 --- a/src/mem/slicc/ast/EnqueueStatementAST.py +++ b/src/mem/slicc/ast/EnqueueStatementAST.py @@ -76,7 +76,8 @@ class EnqueueStatementAST(StatementAST): # Declare message code( "std::shared_ptr<${{msg_type.c_ident}}> out_msg = " - "std::make_shared<${{msg_type.c_ident}}>(clockEdge());" + "std::make_shared<${{msg_type.c_ident}}>(clockEdge(), " + " m_ruby_system->getBlockSizeBytes(), m_ruby_system);" ) # The other statements @@ -89,17 +90,21 @@ class EnqueueStatementAST(StatementAST): bypass_strict_fifo_code = self.bypass_strict_fifo.inline(False) code( "(${{self.queue_name.var.code}}).enqueue(" - "out_msg, clockEdge(), cyclesToTicks(Cycles($rcode)), $bypass_strict_fifo_code);" + "out_msg, clockEdge(), cyclesToTicks(Cycles($rcode)), " + "m_ruby_system->getRandomization(), m_ruby_system->getWarmupEnabled(), " + "$bypass_strict_fifo_code);" ) else: code( "(${{self.queue_name.var.code}}).enqueue(" - "out_msg, clockEdge(), cyclesToTicks(Cycles($rcode)));" + "out_msg, clockEdge(), cyclesToTicks(Cycles($rcode)), " + "m_ruby_system->getRandomization(), m_ruby_system->getWarmupEnabled());" ) else: code( "(${{self.queue_name.var.code}}).enqueue(out_msg, " - "clockEdge(), cyclesToTicks(Cycles(1)));" + "clockEdge(), cyclesToTicks(Cycles(1))," + "m_ruby_system->getRandomization(), m_ruby_system->getWarmupEnabled());" ) # End scope diff --git a/src/mem/slicc/ast/LocalVariableAST.py b/src/mem/slicc/ast/LocalVariableAST.py index b4ac8f446b..43ab110a67 100644 --- a/src/mem/slicc/ast/LocalVariableAST.py +++ b/src/mem/slicc/ast/LocalVariableAST.py @@ -73,6 +73,8 @@ class LocalVariableAST(StatementAST): ) ): 
code += f"{type.c_ident}* {ident}" + elif "implicit_ctor" in type: + code += f"{type.c_ident} {ident}({type['implicit_ctor']})" else: code += f"{type.c_ident} {ident}" return type diff --git a/src/mem/slicc/ast/PeekStatementAST.py b/src/mem/slicc/ast/PeekStatementAST.py index 00edff4e7b..415f4ec465 100644 --- a/src/mem/slicc/ast/PeekStatementAST.py +++ b/src/mem/slicc/ast/PeekStatementAST.py @@ -93,7 +93,8 @@ class PeekStatementAST(StatementAST): if (m_is_blocking && (m_block_map.count(in_msg_ptr->m_$address_field) == 1) && (m_block_map[in_msg_ptr->m_$address_field] != &$qcode)) { - $qcode.delayHead(clockEdge(), cyclesToTicks(Cycles(1))); + $qcode.delayHead(clockEdge(), cyclesToTicks(Cycles(1)), + m_ruby_system->getRandomization(), m_ruby_system->getWarmupEnabled()); continue; } """ diff --git a/src/mem/slicc/symbols/StateMachine.py b/src/mem/slicc/symbols/StateMachine.py index b523522501..6202d2d239 100644 --- a/src/mem/slicc/symbols/StateMachine.py +++ b/src/mem/slicc/symbols/StateMachine.py @@ -352,7 +352,6 @@ class $c_ident : public AbstractController public: typedef ${c_ident}Params Params; $c_ident(const Params &p); - static int getNumControllers(); void init(); MessageBuffer *getMandatoryQueue() const; @@ -449,9 +448,8 @@ int m_counters[${ident}_State_NUM][${ident}_Event_NUM]; int m_event_counters[${ident}_Event_NUM]; bool m_possible[${ident}_State_NUM][${ident}_Event_NUM]; -static std::vector eventVec; -static std::vector > transVec; -static int m_num_controllers; +std::vector eventVec; +std::vector > transVec; // Internal functions """ @@ -625,10 +623,6 @@ namespace gem5 namespace ruby { -int $c_ident::m_num_controllers = 0; -std::vector $c_ident::eventVec; -std::vector > $c_ident::transVec; - // for adding information to the protocol debug trace std::stringstream ${ident}_transitionComment; @@ -644,8 +638,9 @@ $c_ident::$c_ident(const Params &p) { m_machineID.type = MachineType_${ident}; m_machineID.num = m_version; - m_num_controllers++; + 
p.ruby_system->m_num_controllers[MachineType_${ident}]++; p.ruby_system->registerAbstractController(this); + m_ruby_system = p.ruby_system; m_in_ports = $num_in_ports; """ @@ -699,7 +694,7 @@ void $c_ident::initNetQueues() { MachineType machine_type = string_to_MachineType("${{self.ident}}"); - [[maybe_unused]] int base = MachineType_base_number(machine_type); + [[maybe_unused]] int base = m_ruby_system->MachineType_base_number(machine_type); """ ) @@ -776,6 +771,17 @@ $c_ident::init() comment = f"Type {vtype.ident} default" code('*$vid = ${{vtype["default"]}}; // $comment') + # For objects that require knowing the cache line size, + # set the value here. + if vtype.c_ident in ("TBETable"): + block_size_func = "m_ruby_system->getBlockSizeBytes()" + code(f"(*{vid}).setBlockSize({block_size_func});") + + for param in self.config_parameters: + if param.type_ast.type.ident == "CacheMemory": + assert param.pointer + code(f"m_{param.ident}_ptr->setRubySystem(m_ruby_system);") + # Set the prefetchers code() for prefetcher in self.prefetchers: @@ -942,7 +948,9 @@ $c_ident::regStats() "${c_ident}." + ${ident}_Event_to_string(event); statistics::Vector *t = new statistics::Vector(profilerStatsPtr, stat_name.c_str()); - t->init(m_num_controllers); + int num_controllers = + m_ruby_system->m_num_controllers[MachineType_${ident}]; + t->init(num_controllers); t->flags(statistics::pdf | statistics::total | statistics::oneline | statistics::nozero); @@ -961,7 +969,9 @@ $c_ident::regStats() "." 
+ ${ident}_Event_to_string(event); statistics::Vector *t = new statistics::Vector( profilerStatsPtr, stat_name.c_str()); - t->init(m_num_controllers); + int num_controllers = + m_ruby_system->m_num_controllers[MachineType_${ident}]; + t->init(num_controllers); t->flags(statistics::pdf | statistics::total | statistics::oneline | statistics::nozero); transVec[state].push_back(t); @@ -1062,9 +1072,12 @@ $c_ident::regStats() void $c_ident::collateStats() { + int num_controllers = + m_ruby_system->m_num_controllers[MachineType_${ident}]; + for (${ident}_Event event = ${ident}_Event_FIRST; event < ${ident}_Event_NUM; ++event) { - for (unsigned int i = 0; i < m_num_controllers; ++i) { + for (unsigned int i = 0; i < num_controllers; ++i) { RubySystem *rs = params().ruby_system; std::map::iterator it = rs->m_abstract_controls[MachineType_${ident}].find(i); @@ -1080,7 +1093,7 @@ $c_ident::collateStats() for (${ident}_Event event = ${ident}_Event_FIRST; event < ${ident}_Event_NUM; ++event) { - for (unsigned int i = 0; i < m_num_controllers; ++i) { + for (unsigned int i = 0; i < num_controllers; ++i) { RubySystem *rs = params().ruby_system; std::map::iterator it = rs->m_abstract_controls[MachineType_${ident}].find(i); @@ -1125,12 +1138,6 @@ $c_ident::getTransitionCount(${ident}_State state, return m_counters[state][event]; } -int -$c_ident::getNumControllers() -{ - return m_num_controllers; -} - MessageBuffer* $c_ident::getMandatoryQueue() const { @@ -1181,6 +1188,7 @@ void $c_ident::set_cache_entry(${{self.EntryType.c_ident}}*& m_cache_entry_ptr, AbstractCacheEntry* m_new_cache_entry) { m_cache_entry_ptr = (${{self.EntryType.c_ident}}*)m_new_cache_entry; + m_cache_entry_ptr->setRubySystem(m_ruby_system); } void @@ -1200,6 +1208,7 @@ void $c_ident::set_tbe(${{self.TBEType.c_ident}}*& m_tbe_ptr, ${{self.TBEType.c_ident}}* m_new_tbe) { m_tbe_ptr = m_new_tbe; + m_tbe_ptr->setRubySystem(m_ruby_system); } void diff --git a/src/mem/slicc/symbols/Type.py 
b/src/mem/slicc/symbols/Type.py index 535a4165b3..53c8ff877e 100644 --- a/src/mem/slicc/symbols/Type.py +++ b/src/mem/slicc/symbols/Type.py @@ -119,6 +119,10 @@ class Type(Symbol): def isMessage(self): return "message" in self + @property + def isTBE(self): + return "tbe" in self + @property def isBuffer(self): return "buffer" in self @@ -250,18 +254,54 @@ namespace gem5 namespace ruby { +class RubySystem; + $klass ${{self.c_ident}}$parent { public: - ${{self.c_ident}} """, klass="class", ) if self.isMessage: - code("(Tick curTime) : %s(curTime) {" % self["interface"]) + code( + "${{self.c_ident}}(Tick curTime, int blockSize, RubySystem* rs) : %s(curTime, blockSize, rs)" + % self["interface"] + ) + + for dm in self.data_members.values(): + if dm.real_c_type in ("DataBlock", "WriteMask"): + code(f"\t\t, m_{dm.ident}(blockSize)") + + code("{") + elif self.isTBE: + code("${{self.c_ident}}(int block_size)") + + ctor_count = 0 + for dm in self.data_members.values(): + if dm.real_c_type in ("DataBlock", "WriteMask"): + if ctor_count == 0: + code("\t:") + else: + code("\t, ") + code(f"\t\tm_{dm.ident}(block_size)") + ctor_count += 1 + + code("{") else: - code("()\n\t\t{") + code("${{self.c_ident}}()") + + ctor_count = 0 + for dm in self.data_members.values(): + if dm.real_c_type in ("DataBlock", "WriteMask"): + if ctor_count == 0: + code("\t:") + else: + code("\t, ") + code(f"\t\tm_{dm.ident}(0)") + ctor_count += 1 + + code("{") code.indent() if not self.isGlobal: @@ -280,6 +320,12 @@ $klass ${{self.c_ident}}$parent code(" // default value of $tid") else: code("// m_$ident has no default") + + # These parts of Messages need RubySystem pointers. For things + # like Entry which only store NetDest, RubySystem is not needed. 
+ if self.isMessage and dm.real_c_type == "NetDest": + code("// m_$ident requires RubySystem") + code("m_$ident.setRubySystem(rs);") code.dedent() code("}") @@ -300,21 +346,45 @@ $klass ${{self.c_ident}}$parent params = ", ".join(params) if self.isMessage: - params = "const Tick curTime, " + params + params = ( + "const Tick curTime, const int blockSize, const RubySystem *rs, " + + params + ) code("${{self.c_ident}}($params)") # Call superclass constructor if "interface" in self: if self.isMessage: - code(' : ${{self["interface"]}}(curTime)') + code( + ' : ${{self["interface"]}}(curTime, blockSize, rs)' + ) + + for dm in self.data_members.values(): + if dm.real_c_type in ("DataBlock", "WriteMask"): + code(f"\t\t, m_{dm.ident}(blockSize)") else: code(' : ${{self["interface"]}}()') + for dm in self.data_members.values(): + if dm.real_c_type in ("DataBlock", "WriteMask"): + code(f"\t\t, m_{dm.ident}(local_{dm.ident})") + else: + ctor_count = 0 + for dm in self.data_members.values(): + if dm.real_c_type in ("DataBlock", "WriteMask"): + if ctor_count == 0: + code("\t:") + else: + code("\t, ") + code(f"\t\tm_{dm.ident}(local_{dm.ident})") + ctor_count += 1 + code("{") code.indent() for dm in self.data_members.values(): - code("m_${{dm.ident}} = local_${{dm.ident}};") + if not dm.real_c_type in ("DataBlock", "WriteMask"): + code("m_${{dm.ident}} = local_${{dm.ident}};") code.dedent() code("}") @@ -342,6 +412,35 @@ clone() const ) if not self.isGlobal: + # Block size setter for fields that require block size + # Intentionally do not begin function name with "set" in case + # the user has a field named BlockSize which would conflict + # with the method generated below. 
+ code("\nvoid initBlockSize(int block_size)") + code("{") + code("\tblock_size_bits = floorLog2(block_size);") + + needs_block_size = ( + "DataBlock", + "WriteMask", + "PersistentTable", + "TimerTable", + "PerfectCacheMemory", + ) + + for dm in self.data_members.values(): + if dm.real_c_type in needs_block_size: + code(f"\tm_{dm.ident}.setBlockSize(block_size);") + code("}\n") + + code("\nvoid setRubySystem(RubySystem *ruby_system)") + code("{") + for dm in self.data_members.values(): + if dm.real_c_type in ("NetDest"): + code(f"// m_{dm.ident} requires RubySystem") + code(f"\tm_{dm.ident}.setRubySystem(ruby_system);") + code("}\n") + # const Get methods for each field code("// Const accessors methods for each field") for dm in self.data_members.values(): @@ -393,6 +492,9 @@ set${{dm.ident}}(const ${{dm.real_c_type}}& local_${{dm.ident}}) code(" //private:") code.indent() + # block_size_bits for print methods + code("int block_size_bits = 0;") + # Data members for each field for dm in self.data_members.values(): if "abstract" not in dm: @@ -473,7 +575,7 @@ ${{self.c_ident}}::print(std::ostream& out) const if dm.type.c_ident == "Addr": code( """ -out << "${{dm.ident}} = " << printAddress(m_${{dm.ident}}) << " ";""" +out << "${{dm.ident}} = " << printAddress(m_${{dm.ident}}, block_size_bits) << " ";""" ) else: code('out << "${{dm.ident}} = " << m_${{dm.ident}} << " ";' "") @@ -846,7 +948,7 @@ ${{self.c_ident}}_from_base_level(int type) * \\return the base number of components for each machine */ int -${{self.c_ident}}_base_number(const ${{self.c_ident}}& obj) +RubySystem::${{self.c_ident}}_base_number(const ${{self.c_ident}}& obj) { int base = 0; switch(obj) { @@ -860,7 +962,7 @@ ${{self.c_ident}}_base_number(const ${{self.c_ident}}& obj) # Check if there is a defined machine with this type if enum.primary: code( - " base += ${{enum.ident}}_Controller::getNumControllers();" + "\tbase += m_num_controllers[${{self.c_ident}}_${{enum.ident}}];" ) else: code(" base += 
0;") @@ -882,7 +984,7 @@ ${{self.c_ident}}_base_number(const ${{self.c_ident}}& obj) * \\return the total number of components for each machine */ int -${{self.c_ident}}_base_count(const ${{self.c_ident}}& obj) +RubySystem::${{self.c_ident}}_base_count(const ${{self.c_ident}}& obj) { switch(obj) { """ @@ -893,7 +995,7 @@ ${{self.c_ident}}_base_count(const ${{self.c_ident}}& obj) code("case ${{self.c_ident}}_${{enum.ident}}:") if enum.primary: code( - "return ${{enum.ident}}_Controller::getNumControllers();" + "return m_num_controllers[${{self.c_ident}}_${{enum.ident}}];" ) else: code("return 0;") diff --git a/src/python/SConscript b/src/python/SConscript index 3aed9f03e3..afe786536c 100644 --- a/src/python/SConscript +++ b/src/python/SConscript @@ -280,6 +280,7 @@ PySource('gem5.components.processors', PySource('gem5.prebuilt', 'gem5/prebuilt/__init__.py') PySource('gem5.prebuilt.demo', 'gem5/prebuilt/demo/__init__.py') PySource('gem5.prebuilt.demo', 'gem5/prebuilt/demo/x86_demo_board.py') +PySource('gem5.prebuilt.demo', 'gem5/prebuilt/demo/arm_demo_board.py') PySource('gem5.prebuilt.riscvmatched', 'gem5/prebuilt/riscvmatched/__init__.py') PySource('gem5.prebuilt.riscvmatched', diff --git a/src/python/gem5/components/boards/abstract_board.py b/src/python/gem5/components/boards/abstract_board.py index 83ca32d9c0..cd6f559937 100644 --- a/src/python/gem5/components/boards/abstract_board.py +++ b/src/python/gem5/components/boards/abstract_board.py @@ -41,6 +41,7 @@ from m5.objects import ( ClockDomain, IOXBar, Port, + Root, SrcClockDomain, System, VoltageDomain, @@ -117,12 +118,6 @@ class AbstractBoard: # Simulator module. self._checkpoint = None - # Setup the board and memory system's memory ranges. - self._setup_memory_ranges() - - # Setup board properties unique to the board being constructed. - self._setup_board() - # A private variable to record whether `_connect_things` has been # been called. 
self._connect_things_called = False @@ -194,6 +189,9 @@ class AbstractBoard: """ self._is_fs = is_fs + self._setup_memory_ranges() + self._setup_board() + def is_fullsystem(self) -> bool: """ Returns ``True`` if the board is to be run in FS mode. Otherwise the board @@ -252,11 +250,14 @@ class AbstractBoard: @abstractmethod def _setup_board(self) -> None: """ - This function is called in the AbstractBoard constructor, before the - memory, processor, and cache hierarchy components are incorporated via - ``_connect_thing()``, but after the ``_setup_memory_ranges()`` function. - This function should be overridden by boards to specify components, - connections unique to that board. + This function is called at the end of `_set_fullsystem`. The reason for + this is the board's configuration varies significantly depending on + whether it is to be run in FS or SE mode. This function is therefore + called when a workload is set --- after construction but before + `_pre_instantiate` is called. + + As `_setup_memory_ranges()` is set in the constructor, this function + can be considered to have been called prior to `_setup_board`. """ raise NotImplementedError @@ -330,10 +331,18 @@ class AbstractBoard: """ Set the memory ranges for this board and memory system. - This is called in the constructor, prior to ``_setup_board`` and - ``_connect_things``. It should query the board's memory to determine the - size and the set the memory ranges on the memory system and on the - board. + This is called at the end of the `_set_fullsystem` function but before + `_setup_board`. `_set_fullsystem` is called when the workload is + declared. It is before `_pre_instantiate` (but, obviously after + construction). + + It should query the board's memory + to determine the size and then set the memory ranges on the memory + system and on the board. 
+ + As this is called at the end of `_set_fullsystem`, the board's memory + can be set up differently depending on whether the board is to be run in + FS or SE mode. The simplest implementation sets the board's memory range to the size of memory and memory system's range to be the same as the board. Full @@ -391,13 +400,42 @@ class AbstractBoard: self.get_cache_hierarchy()._post_instantiate() self.get_memory()._post_instantiate() - def _pre_instantiate(self): + def _pre_instantiate(self, full_system: Optional[bool] = None) -> Root: """To be called immediately before ``m5.instantiate``. This is where - ``_connect_things`` is executed by default.""" + ``_connect_things`` is executed by default and the Root + object is created and returned. - # Connect the memory, processor, and cache hierarchy. + :param full_system: Used to pass the full system flag to the board from + the Simulator module. **Note**: This was + implemented solely to maintain backwards + compatibility while the Simulator module's + `full_system` flag is in a state of deprecation. This + parameter will be removed when it is. When this + occurs whether a simulation is to be run in FS or + SE mode will be determined by the board set.""" + + # 1. Connect the memory, processor, and cache hierarchy. self._connect_things() + # 2. Create the root object + root = Root( + full_system=( + full_system + if full_system is not None + else self.is_fullsystem() + ), + board=self, + ) + + # 3. Call any of the components' `_pre_instantiate` functions. + self.get_processor()._pre_instantiate(root) + self.get_memory()._pre_instantiate(root) + if self.get_cache_hierarchy(): + self.get_cache_hierarchy()._pre_instantiate(root) + + # 4. Return the root object. 
+ return root + def _connect_things_check(self): """ Here we check that connect things has been called and throw an diff --git a/src/python/gem5/components/boards/abstract_system_board.py b/src/python/gem5/components/boards/abstract_system_board.py index 8fe48920b5..a8765ee909 100644 --- a/src/python/gem5/components/boards/abstract_system_board.py +++ b/src/python/gem5/components/boards/abstract_system_board.py @@ -36,7 +36,6 @@ from .abstract_board import AbstractBoard class AbstractSystemBoard(System, AbstractBoard): - """ An abstract board for cases where boards should inherit from System. """ diff --git a/src/python/gem5/components/boards/arm_board.py b/src/python/gem5/components/boards/arm_board.py index 0a0cd2fa28..2da8cd18f2 100644 --- a/src/python/gem5/components/boards/arm_board.py +++ b/src/python/gem5/components/boards/arm_board.py @@ -28,6 +28,7 @@ import os from abc import ABCMeta from typing import ( List, + Optional, Sequence, Tuple, ) @@ -274,11 +275,15 @@ class ArmBoard(ArmSystem, AbstractBoard, KernelDiskWorkload): @overrides(AbstractBoard) def get_mem_ports(self) -> Sequence[Tuple[AddrRange, Port]]: - all_ports = [ - (self.realview.bootmem.range, self.realview.bootmem.port), - ] + self.get_memory().get_mem_ports() + # Note: Ruby needs to create a directory for the realview bootmem + if self.get_cache_hierarchy().is_ruby(): + all_ports = [ + (self.realview.bootmem.range, self.realview.bootmem.port), + ] + self.get_memory().get_mem_ports() - return all_ports + return all_ports + + return super().get_mem_ports() @overrides(AbstractBoard) def has_io_bus(self) -> bool: @@ -327,8 +332,8 @@ class ArmBoard(ArmSystem, AbstractBoard, KernelDiskWorkload): self.system_port = port @overrides(AbstractBoard) - def _pre_instantiate(self): - super()._pre_instantiate() + def _pre_instantiate(self, full_system: Optional[bool] = None) -> None: + super()._pre_instantiate(full_system=full_system) # Add the PCI devices. 
self.pci_devices = self._pci_devices diff --git a/src/python/gem5/components/boards/riscv_board.py b/src/python/gem5/components/boards/riscv_board.py index e8e27029f2..e14833c996 100644 --- a/src/python/gem5/components/boards/riscv_board.py +++ b/src/python/gem5/components/boards/riscv_board.py @@ -26,7 +26,10 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import os -from typing import List +from typing import ( + List, + Optional, +) import m5 from m5.objects import ( @@ -498,7 +501,7 @@ class RiscvBoard(AbstractSystemBoard, KernelDiskWorkload): return "/dev/vda" @overrides(AbstractSystemBoard) - def _pre_instantiate(self): + def _pre_instantiate(self, full_system: Optional[bool] = None): if len(self._bootloader) > 0: self.workload.bootloader_addr = 0x0 self.workload.bootloader_filename = self._bootloader[0] @@ -507,7 +510,7 @@ class RiscvBoard(AbstractSystemBoard, KernelDiskWorkload): else: self.workload.kernel_addr = 0x0 self.workload.entry_point = 0x80000000 - self._connect_things() + super()._pre_instantiate(full_system=full_system) @overrides(KernelDiskWorkload) def _add_disk_to_board(self, disk_image: AbstractResource): diff --git a/src/python/gem5/components/boards/test_board.py b/src/python/gem5/components/boards/test_board.py index 2599c6853d..6acce79b1c 100644 --- a/src/python/gem5/components/boards/test_board.py +++ b/src/python/gem5/components/boards/test_board.py @@ -44,7 +44,6 @@ from .abstract_system_board import AbstractSystemBoard class TestBoard(AbstractSystemBoard): - """This is a Testing Board used to run traffic generators on a simple architecture. 
diff --git a/src/python/gem5/components/cachehierarchies/abstract_cache_hierarchy.py b/src/python/gem5/components/cachehierarchies/abstract_cache_hierarchy.py index b0435543af..dc20c14f70 100644 --- a/src/python/gem5/components/cachehierarchies/abstract_cache_hierarchy.py +++ b/src/python/gem5/components/cachehierarchies/abstract_cache_hierarchy.py @@ -42,7 +42,10 @@ from abc import ( ) from typing import Callable -from m5.objects import SubSystem +from m5.objects import ( + Root, + SubSystem, +) from m5.util.fdthelper import * from ..boards.abstract_board import AbstractBoard @@ -139,6 +142,18 @@ class AbstractCacheHierarchy(SubSystem): """ raise NotImplementedError + def _pre_instantiate(self, root: Root) -> None: + """Called in the `AbstractBoard`'s `_pre_instantiate` method. This is + called after `connect_things`, after the creation of the root object + (which is passed in as an argument), but before `m5.instantiate`). + + Subclasses should override this method to set up any connections. + + At present there is no general task that must be specified here and is + default or applicable to all cache hierarchies. + """ + pass + def _post_instantiate(self): """Called to set up anything needed after ``m5.instantiate``.""" pass diff --git a/src/python/gem5/components/cachehierarchies/chi/private_l1_cache_hierarchy.py b/src/python/gem5/components/cachehierarchies/chi/private_l1_cache_hierarchy.py index 29df2a969c..42c4e2258c 100644 --- a/src/python/gem5/components/cachehierarchies/chi/private_l1_cache_hierarchy.py +++ b/src/python/gem5/components/cachehierarchies/chi/private_l1_cache_hierarchy.py @@ -82,6 +82,7 @@ class PrivateL1CacheHierarchy(AbstractRubyCacheHierarchy): @overrides(AbstractCacheHierarchy) def incorporate_cache(self, board: AbstractBoard) -> None: + super().incorporate_cache(board) self.ruby_system = RubySystem() # Ruby's global network. 
@@ -137,7 +138,9 @@ class PrivateL1CacheHierarchy(AbstractRubyCacheHierarchy): # Set up a proxy port for the system_port. Used for load binaries and # other functional-only things. - self.ruby_system.sys_port_proxy = RubyPortProxy() + self.ruby_system.sys_port_proxy = RubyPortProxy( + ruby_system=self.ruby_system + ) board.connect_system_port(self.ruby_system.sys_port_proxy.in_ports) def _create_core_cluster( @@ -167,12 +170,16 @@ class PrivateL1CacheHierarchy(AbstractRubyCacheHierarchy): ) cluster.icache.sequencer = RubySequencer( - version=core_num, dcache=NULL, clk_domain=cluster.icache.clk_domain + version=core_num, + dcache=NULL, + clk_domain=cluster.icache.clk_domain, + ruby_system=self.ruby_system, ) cluster.dcache.sequencer = RubySequencer( version=core_num, dcache=cluster.dcache.cache, clk_domain=cluster.dcache.clk_domain, + ruby_system=self.ruby_system, ) if board.has_io_bus(): @@ -223,7 +230,11 @@ class PrivateL1CacheHierarchy(AbstractRubyCacheHierarchy): board.get_clock_domain(), ) version = len(board.get_processor().get_cores()) + i - ctrl.sequencer = RubySequencer(version=version, in_ports=port) + ctrl.sequencer = RubySequencer( + version=version, + in_ports=port, + ruby_system=self.ruby_system, + ) ctrl.sequencer.dcache = NULL ctrl.ruby_system = self.ruby_system @@ -234,3 +245,10 @@ class PrivateL1CacheHierarchy(AbstractRubyCacheHierarchy): dma_controllers.append(ctrl) return dma_controllers + + @overrides(AbstractRubyCacheHierarchy) + def _reset_version_numbers(self): + from .nodes.abstract_node import AbstractNode + + AbstractNode._version = 0 + MemoryController._version = 0 diff --git a/src/python/gem5/components/cachehierarchies/classic/no_cache.py b/src/python/gem5/components/cachehierarchies/classic/no_cache.py index e6ec89b660..c3c791f4e0 100644 --- a/src/python/gem5/components/cachehierarchies/classic/no_cache.py +++ b/src/python/gem5/components/cachehierarchies/classic/no_cache.py @@ -124,7 +124,7 @@ class 
NoCache(AbstractClassicCacheHierarchy): # Set up the system port for functional access from the simulator. board.connect_system_port(self.membus.cpu_side_ports) - for _, port in board.get_memory().get_mem_ports(): + for _, port in board.get_mem_ports(): self.membus.mem_side_ports = port def _setup_coherent_io_bridge(self, board: AbstractBoard) -> None: diff --git a/src/python/gem5/components/cachehierarchies/classic/private_l1_cache_hierarchy.py b/src/python/gem5/components/cachehierarchies/classic/private_l1_cache_hierarchy.py index 8f63d3320f..9382d11036 100644 --- a/src/python/gem5/components/cachehierarchies/classic/private_l1_cache_hierarchy.py +++ b/src/python/gem5/components/cachehierarchies/classic/private_l1_cache_hierarchy.py @@ -96,7 +96,7 @@ class PrivateL1CacheHierarchy(AbstractClassicCacheHierarchy): # Set up the system port for functional access from the simulator. board.connect_system_port(self.membus.cpu_side_ports) - for _, port in board.get_memory().get_mem_ports(): + for _, port in board.get_mem_ports(): self.membus.mem_side_ports = port self.l1icaches = [ diff --git a/src/python/gem5/components/cachehierarchies/classic/private_l1_private_l2_cache_hierarchy.py b/src/python/gem5/components/cachehierarchies/classic/private_l1_private_l2_cache_hierarchy.py index 049d0fb102..354d9d064d 100644 --- a/src/python/gem5/components/cachehierarchies/classic/private_l1_private_l2_cache_hierarchy.py +++ b/src/python/gem5/components/cachehierarchies/classic/private_l1_private_l2_cache_hierarchy.py @@ -126,7 +126,7 @@ class PrivateL1PrivateL2CacheHierarchy( # Set up the system port for functional access from the simulator. 
board.connect_system_port(self.membus.cpu_side_ports) - for _, port in board.get_memory().get_mem_ports(): + for _, port in board.get_mem_ports(): self.membus.mem_side_ports = port self.l2buses = [ diff --git a/src/python/gem5/components/cachehierarchies/classic/private_l1_shared_l2_cache_hierarchy.py b/src/python/gem5/components/cachehierarchies/classic/private_l1_shared_l2_cache_hierarchy.py index 4a896b2292..1f0d62d541 100644 --- a/src/python/gem5/components/cachehierarchies/classic/private_l1_shared_l2_cache_hierarchy.py +++ b/src/python/gem5/components/cachehierarchies/classic/private_l1_shared_l2_cache_hierarchy.py @@ -119,7 +119,7 @@ class PrivateL1SharedL2CacheHierarchy( # Set up the system port for functional access from the simulator. board.connect_system_port(self.membus.cpu_side_ports) - for _, port in board.get_memory().get_mem_ports(): + for _, port in board.get_mem_ports(): self.membus.mem_side_ports = port self.l1icaches = [ diff --git a/src/python/gem5/components/cachehierarchies/ruby/abstract_ruby_cache_hierarchy.py b/src/python/gem5/components/cachehierarchies/ruby/abstract_ruby_cache_hierarchy.py index 3528b74495..6e7e957934 100644 --- a/src/python/gem5/components/cachehierarchies/ruby/abstract_ruby_cache_hierarchy.py +++ b/src/python/gem5/components/cachehierarchies/ruby/abstract_ruby_cache_hierarchy.py @@ -37,6 +37,18 @@ class AbstractRubyCacheHierarchy(AbstractCacheHierarchy): def __init__(self): super().__init__() + def _reset_version_numbers(self): + """Needed for multiple ruby systems so that each system starts at 0. + + Note: This needs to be overridden by the protocol since we don't know + the machine classes at this point. 
+ """ + raise NotImplementedError + + @overrides(AbstractCacheHierarchy) + def incorporate_cache(self, board): + self._reset_version_numbers() + @overrides(AbstractCacheHierarchy) def is_ruby(self) -> bool: return True diff --git a/src/python/gem5/components/cachehierarchies/ruby/caches/mesi_three_level/directory.py b/src/python/gem5/components/cachehierarchies/ruby/caches/mesi_three_level/directory.py index 4840e3b264..d0c54840fc 100644 --- a/src/python/gem5/components/cachehierarchies/ruby/caches/mesi_three_level/directory.py +++ b/src/python/gem5/components/cachehierarchies/ruby/caches/mesi_three_level/directory.py @@ -37,7 +37,7 @@ class Directory(AbstractDirectory): def __init__(self, network, cache_line_size, mem_range, port): super().__init__(network, cache_line_size) self.addr_ranges = [mem_range] - self.directory = RubyDirectoryMemory() + self.directory = RubyDirectoryMemory(block_size=cache_line_size) # Connect this directory to the memory side. self.memory_out_port = port diff --git a/src/python/gem5/components/cachehierarchies/ruby/caches/mesi_three_level/l1_cache.py b/src/python/gem5/components/cachehierarchies/ruby/caches/mesi_three_level/l1_cache.py index 6d203f978a..ef90ac79f6 100644 --- a/src/python/gem5/components/cachehierarchies/ruby/caches/mesi_three_level/l1_cache.py +++ b/src/python/gem5/components/cachehierarchies/ruby/caches/mesi_three_level/l1_cache.py @@ -80,7 +80,7 @@ class L1Cache(L0Cache_Controller): replacement_policy=LRURP(), ) self.clk_domain = clk_domain - self.prefetcher = RubyPrefetcher() + self.prefetcher = RubyPrefetcher(block_size=cache_line_size) self.send_evictions = core.requires_send_evicts() self.transitions_per_cycle = 32 self.enable_prefetch = False diff --git a/src/python/gem5/components/cachehierarchies/ruby/caches/mesi_three_level/l2_cache.py b/src/python/gem5/components/cachehierarchies/ruby/caches/mesi_three_level/l2_cache.py index ff2b8e3dd9..7c473f8be9 100644 --- 
a/src/python/gem5/components/cachehierarchies/ruby/caches/mesi_three_level/l2_cache.py +++ b/src/python/gem5/components/cachehierarchies/ruby/caches/mesi_three_level/l2_cache.py @@ -75,7 +75,7 @@ class L2Cache(L1Cache_Controller): self.l2_select_num_bits = int(math.log(num_l3Caches, 2)) self.cluster_id = cluster_id self.clk_domain = clk_domain - self.prefetcher = RubyPrefetcher() + self.prefetcher = RubyPrefetcher(block_size=cache_line_size) self.transitions_per_cycle = 32 # l1_request_latency, l1_response_latency, to_l2_latency are # ruby backend terminology. diff --git a/src/python/gem5/components/cachehierarchies/ruby/caches/mesi_two_level/directory.py b/src/python/gem5/components/cachehierarchies/ruby/caches/mesi_two_level/directory.py index 4840e3b264..d0c54840fc 100644 --- a/src/python/gem5/components/cachehierarchies/ruby/caches/mesi_two_level/directory.py +++ b/src/python/gem5/components/cachehierarchies/ruby/caches/mesi_two_level/directory.py @@ -37,7 +37,7 @@ class Directory(AbstractDirectory): def __init__(self, network, cache_line_size, mem_range, port): super().__init__(network, cache_line_size) self.addr_ranges = [mem_range] - self.directory = RubyDirectoryMemory() + self.directory = RubyDirectoryMemory(block_size=cache_line_size) # Connect this directory to the memory side. 
self.memory_out_port = port diff --git a/src/python/gem5/components/cachehierarchies/ruby/caches/mesi_two_level/l1_cache.py b/src/python/gem5/components/cachehierarchies/ruby/caches/mesi_two_level/l1_cache.py index 7787644c9b..13625beea7 100644 --- a/src/python/gem5/components/cachehierarchies/ruby/caches/mesi_two_level/l1_cache.py +++ b/src/python/gem5/components/cachehierarchies/ruby/caches/mesi_two_level/l1_cache.py @@ -73,7 +73,7 @@ class L1Cache(AbstractL1Cache): ) self.l2_select_num_bits = int(math.log(num_l2Caches, 2)) self.clk_domain = clk_domain - self.prefetcher = RubyPrefetcher() + self.prefetcher = RubyPrefetcher(block_size=cache_line_size) self.send_evictions = core.requires_send_evicts() self.transitions_per_cycle = 4 self.enable_prefetch = False diff --git a/src/python/gem5/components/cachehierarchies/ruby/caches/mi_example/directory.py b/src/python/gem5/components/cachehierarchies/ruby/caches/mi_example/directory.py index 3d1ae54104..79e40e9e01 100644 --- a/src/python/gem5/components/cachehierarchies/ruby/caches/mi_example/directory.py +++ b/src/python/gem5/components/cachehierarchies/ruby/caches/mi_example/directory.py @@ -41,7 +41,7 @@ class Directory(AbstractDirectory): def __init__(self, network, cache_line_size, mem_range, port): super().__init__(network, cache_line_size) self.addr_ranges = [mem_range] - self.directory = RubyDirectoryMemory() + self.directory = RubyDirectoryMemory(block_size=cache_line_size) # Connect this directory to the memory side. 
self.memory_out_port = port diff --git a/src/python/gem5/components/cachehierarchies/ruby/caches/prebuilt/octopi_cache/core_complex.py b/src/python/gem5/components/cachehierarchies/ruby/caches/prebuilt/octopi_cache/core_complex.py index 9aa0dc4a36..212c06c4c3 100644 --- a/src/python/gem5/components/cachehierarchies/ruby/caches/prebuilt/octopi_cache/core_complex.py +++ b/src/python/gem5/components/cachehierarchies/ruby/caches/prebuilt/octopi_cache/core_complex.py @@ -143,6 +143,7 @@ class CoreComplex(SubSystem, RubyNetworkComponent): version=core_id, dcache=cluster.l1_cache.Dcache, clk_domain=cluster.l1_cache.clk_domain, + ruby_system=self._ruby_system, ) if self._board.has_io_bus(): diff --git a/src/python/gem5/components/cachehierarchies/ruby/caches/prebuilt/octopi_cache/octopi.py b/src/python/gem5/components/cachehierarchies/ruby/caches/prebuilt/octopi_cache/octopi.py index f7d4d63de1..d576ae6ae4 100644 --- a/src/python/gem5/components/cachehierarchies/ruby/caches/prebuilt/octopi_cache/octopi.py +++ b/src/python/gem5/components/cachehierarchies/ruby/caches/prebuilt/octopi_cache/octopi.py @@ -38,6 +38,7 @@ from ......components.cachehierarchies.ruby.caches.mesi_three_level.directory im from ......components.cachehierarchies.ruby.caches.mesi_three_level.dma_controller import ( DMAController, ) +from ......utils.override import overrides from ......utils.requires import requires from ....abstract_three_level_cache_hierarchy import ( AbstractThreeLevelCacheHierarchy, @@ -95,6 +96,7 @@ class OctopiCache( requires( coherence_protocol_required=CoherenceProtocol.MESI_THREE_LEVEL ) + super().incorporate_cache(board) cache_line_size = board.get_cache_line_size() @@ -151,7 +153,9 @@ class OctopiCache( # Set up a proxy port for the system_port. Used for load binaries and # other functional-only things. 
- self.ruby_system.sys_port_proxy = RubyPortProxy() + self.ruby_system.sys_port_proxy = RubyPortProxy( + ruby_system=self.ruby_system + ) board.connect_system_port(self.ruby_system.sys_port_proxy.in_ports) def _create_directory_controllers(self, board): @@ -228,7 +232,11 @@ class OctopiCache( if board.has_dma_ports(): self.ruby_system.dma_controllers = [ DMAController( - dma_sequencer=DMASequencer(version=i + 1, in_ports=port), + dma_sequencer=DMASequencer( + version=i + 1, + in_ports=port, + ruby_system=self.ruby_system, + ), ruby_system=self.ruby_system, ) for i, port in enumerate(board.get_dma_ports()) @@ -261,3 +269,15 @@ class OctopiCache( ] for link in self.dma_int_links: self.ruby_system.network._add_int_link(link) + + @overrides(AbstractRubyCacheHierarchy) + def _reset_version_numbers(self): + from ....caches.mesi_three_level.l1_cache import L1Cache + from ....caches.mesi_three_level.l2_cache import L2Cache + from ....caches.mesi_three_level.l3_cache import L3Cache + + Directory._version = 0 + L1Cache._version = 0 + L2Cache._version = 0 + L3Cache._version = 0 + DMAController._version = 0 diff --git a/src/python/gem5/components/cachehierarchies/ruby/mesi_three_level_cache_hierarchy.py b/src/python/gem5/components/cachehierarchies/ruby/mesi_three_level_cache_hierarchy.py index 66fea95636..501fbab081 100644 --- a/src/python/gem5/components/cachehierarchies/ruby/mesi_three_level_cache_hierarchy.py +++ b/src/python/gem5/components/cachehierarchies/ruby/mesi_three_level_cache_hierarchy.py @@ -33,6 +33,7 @@ from m5.objects import ( ) from ....coherence_protocol import CoherenceProtocol +from ....utils.override import overrides from ....utils.requires import requires requires(coherence_protocol_required=CoherenceProtocol.MESI_THREE_LEVEL) @@ -87,6 +88,7 @@ class MESIThreeLevelCacheHierarchy( self._num_l3_banks = num_l3_banks def incorporate_cache(self, board: AbstractBoard) -> None: + super().incorporate_cache(board) cache_line_size = board.get_cache_line_size() 
self.ruby_system = RubySystem() @@ -118,6 +120,7 @@ class MESIThreeLevelCacheHierarchy( version=core_idx, dcache=l1_cache.Dcache, clk_domain=l1_cache.clk_domain, + ruby_system=self.ruby_system, ) if board.has_io_bus(): @@ -196,7 +199,12 @@ class MESIThreeLevelCacheHierarchy( dma_ports = board.get_dma_ports() for i, port in enumerate(dma_ports): ctrl = DMAController( - DMASequencer(version=i, in_ports=port), self.ruby_system + DMASequencer( + version=i, + in_ports=port, + ruby_system=self.ruby_system, + ), + self.ruby_system, ) self._dma_controllers.append(ctrl) @@ -223,5 +231,15 @@ class MESIThreeLevelCacheHierarchy( # Set up a proxy port for the system_port. Used for load binaries and # other functional-only things. - self.ruby_system.sys_port_proxy = RubyPortProxy() + self.ruby_system.sys_port_proxy = RubyPortProxy( + ruby_system=self.ruby_system + ) board.connect_system_port(self.ruby_system.sys_port_proxy.in_ports) + + @overrides(AbstractRubyCacheHierarchy) + def _reset_version_numbers(self): + Directory._version = 0 + L1Cache._version = 0 + L2Cache._version = 0 + L3Cache._version = 0 + DMAController._version = 0 diff --git a/src/python/gem5/components/cachehierarchies/ruby/mesi_two_level_cache_hierarchy.py b/src/python/gem5/components/cachehierarchies/ruby/mesi_two_level_cache_hierarchy.py index 004c2ff9d2..52a14c7681 100644 --- a/src/python/gem5/components/cachehierarchies/ruby/mesi_two_level_cache_hierarchy.py +++ b/src/python/gem5/components/cachehierarchies/ruby/mesi_two_level_cache_hierarchy.py @@ -33,6 +33,7 @@ from m5.objects import ( ) from ....coherence_protocol import CoherenceProtocol +from ....utils.override import overrides from ....utils.requires import requires requires(coherence_protocol_required=CoherenceProtocol.MESI_TWO_LEVEL) @@ -83,6 +84,7 @@ class MESITwoLevelCacheHierarchy( self._num_l2_banks = num_l2_banks def incorporate_cache(self, board: AbstractBoard) -> None: + super().incorporate_cache(board) cache_line_size = 
board.get_cache_line_size() self.ruby_system = RubySystem() @@ -109,7 +111,10 @@ class MESITwoLevelCacheHierarchy( ) cache.sequencer = RubySequencer( - version=i, dcache=cache.L1Dcache, clk_domain=cache.clk_domain + version=i, + dcache=cache.L1Dcache, + clk_domain=cache.clk_domain, + ruby_system=self.ruby_system, ) if board.has_io_bus(): @@ -163,7 +168,11 @@ class MESITwoLevelCacheHierarchy( dma_ports = board.get_dma_ports() for i, port in enumerate(dma_ports): ctrl = DMAController(self.ruby_system.network, cache_line_size) - ctrl.dma_sequencer = DMASequencer(version=i, in_ports=port) + ctrl.dma_sequencer = DMASequencer( + version=i, + in_ports=port, + ruby_system=self.ruby_system, + ) self._dma_controllers.append(ctrl) ctrl.ruby_system = self.ruby_system @@ -188,5 +197,14 @@ class MESITwoLevelCacheHierarchy( # Set up a proxy port for the system_port. Used for load binaries and # other functional-only things. - self.ruby_system.sys_port_proxy = RubyPortProxy() + self.ruby_system.sys_port_proxy = RubyPortProxy( + ruby_system=self.ruby_system + ) board.connect_system_port(self.ruby_system.sys_port_proxy.in_ports) + + @overrides(AbstractRubyCacheHierarchy) + def _reset_version_numbers(self): + Directory._version = 0 + L1Cache._version = 0 + L2Cache._version = 0 + DMAController._version = 0 diff --git a/src/python/gem5/components/cachehierarchies/ruby/mi_example_cache_hierarchy.py b/src/python/gem5/components/cachehierarchies/ruby/mi_example_cache_hierarchy.py index 478c793560..271bc42536 100644 --- a/src/python/gem5/components/cachehierarchies/ruby/mi_example_cache_hierarchy.py +++ b/src/python/gem5/components/cachehierarchies/ruby/mi_example_cache_hierarchy.py @@ -32,6 +32,7 @@ from m5.objects import ( ) from ....coherence_protocol import CoherenceProtocol +from ....utils.override import overrides from ....utils.requires import requires requires(coherence_protocol_required=CoherenceProtocol.MI_EXAMPLE) @@ -65,6 +66,7 @@ class 
MIExampleCacheHierarchy(AbstractRubyCacheHierarchy): @overrides(AbstractCacheHierarchy) def incorporate_cache(self, board: AbstractBoard) -> None: + super().incorporate_cache(board) self.ruby_system = RubySystem() # Ruby's global network. @@ -95,6 +97,7 @@ class MIExampleCacheHierarchy(AbstractRubyCacheHierarchy): version=i, dcache=cache.cacheMemory, clk_domain=cache.clk_domain, + ruby_system=self.ruby_system, ) if board.has_io_bus(): @@ -140,7 +143,11 @@ class MIExampleCacheHierarchy(AbstractRubyCacheHierarchy): ctrl = DMAController( self.ruby_system.network, board.get_cache_line_size() ) - ctrl.dma_sequencer = DMASequencer(version=i, in_ports=port) + ctrl.dma_sequencer = DMASequencer( + version=i, + in_ports=port, + ruby_system=self.ruby_system, + ) ctrl.ruby_system = self.ruby_system ctrl.dma_sequencer.ruby_system = self.ruby_system @@ -167,5 +174,13 @@ class MIExampleCacheHierarchy(AbstractRubyCacheHierarchy): # Set up a proxy port for the system_port. Used for load binaries and # other functional-only things. 
- self.ruby_system.sys_port_proxy = RubyPortProxy() + self.ruby_system.sys_port_proxy = RubyPortProxy( + ruby_system=self.ruby_system + ) board.connect_system_port(self.ruby_system.sys_port_proxy.in_ports) + + @overrides(AbstractRubyCacheHierarchy) + def _reset_version_numbers(self): + Directory._version = 0 + L1Cache._version = 0 + DMAController._version = 0 diff --git a/src/python/gem5/components/memory/abstract_memory_system.py b/src/python/gem5/components/memory/abstract_memory_system.py index 06fa60cad8..6d24e724b6 100644 --- a/src/python/gem5/components/memory/abstract_memory_system.py +++ b/src/python/gem5/components/memory/abstract_memory_system.py @@ -38,6 +38,7 @@ from m5.objects import ( AddrRange, MemCtrl, Port, + Root, SubSystem, ) @@ -50,6 +51,18 @@ class AbstractMemorySystem(SubSystem): def __init__(self) -> None: super().__init__() + def _pre_instantiate(self, root: Root) -> None: + """Called in the `AbstractBoard`'s `_pre_instantiate` method. This is + called after `connect_things`, after the creation of the root object + (which is passed in as an argument), but before `m5.instantiate`). + + Subclasses should override this method to set up any connections. + + At present there is no general task that must be specified here and is + default or applicable to all memory systems. 
+ """ + pass + @abstractmethod def incorporate_memory(self, board: AbstractBoard) -> None: """This function completes all of the necessary steps to add this diff --git a/src/python/gem5/components/processors/abstract_processor.py b/src/python/gem5/components/processors/abstract_processor.py index 79dba438a2..303b9658f2 100644 --- a/src/python/gem5/components/processors/abstract_processor.py +++ b/src/python/gem5/components/processors/abstract_processor.py @@ -33,7 +33,10 @@ from typing import ( Optional, ) -from m5.objects import SubSystem +from m5.objects import ( + Root, + SubSystem, +) from ...isas import ISA from ...utils.requires import requires @@ -83,3 +86,12 @@ class AbstractProcessor(SubSystem): def _post_instantiate(self) -> None: """Called to set up anything needed after ``m5.instantiate``.""" pass + + def _pre_instantiate(self, root: Root) -> None: + """Called in the `AbstractBoard`'s `_pre_instantiate` method. This is + called after `connect_things`, after the creation of the root object + (which is passed in as an argument), but before `m5.instantiate`). + + Subclasses should override this method to set up any connections. 
+ """ + pass diff --git a/src/python/gem5/components/processors/base_cpu_processor.py b/src/python/gem5/components/processors/base_cpu_processor.py index b1a63ea8ce..674148b409 100644 --- a/src/python/gem5/components/processors/base_cpu_processor.py +++ b/src/python/gem5/components/processors/base_cpu_processor.py @@ -27,12 +27,14 @@ from typing import List +import m5 from m5.objects import ( BaseAtomicSimpleCPU, BaseMinorCPU, BaseNonCachingSimpleCPU, BaseO3CPU, BaseTimingSimpleCPU, + Root, ) from m5.util import warn @@ -99,3 +101,9 @@ class BaseCPUProcessor(AbstractProcessor): board.set_mem_mode(MemMode.ATOMIC) else: raise NotImplementedError + + def _pre_instantiate(self, root: Root) -> None: + super()._pre_instantiate(root) + if any(core.is_kvm_core() for core in self.get_cores()): + m5.ticks.fixGlobalFrequency() + root.sim_quantum = m5.ticks.fromSeconds(0.001) diff --git a/src/python/gem5/components/processors/switchable_processor.py b/src/python/gem5/components/processors/switchable_processor.py index 2436c9e81f..a5a9ae2b6b 100644 --- a/src/python/gem5/components/processors/switchable_processor.py +++ b/src/python/gem5/components/processors/switchable_processor.py @@ -31,6 +31,7 @@ from typing import ( ) import m5 +from m5.objects import Root from ...utils.override import * from ..boards.abstract_board import AbstractBoard @@ -155,3 +156,24 @@ class SwitchableProcessor(AbstractProcessor): # Ensure the current processor is updated. self._current_cores = to_switch + + def _pre_instantiate(self, root: Root) -> None: + super()._pre_instantiate(root) + # The following is a bit of a hack. If a simulation is to use a KVM + # core then the `sim_quantum` value must be set. However, in the + # case of using a SwitchableProcessor the KVM cores may be + # switched out and therefore not accessible via `get_cores()`. + # This is the reason for the `isinstance` check. 
+ # + # We cannot set the `sim_quantum` value in every simulation as + # setting it causes the scheduling of exits to be off by the + # `sim_quantum` value (something necessary if we are using KVM + # cores). Ergo we only set the value if KVM cores are present. + # + # There is still a bug here in that if the user is switching to and + # from KVM and non-KVM cores via the SwitchableProcessor then the + # scheduling of exits for the non-KVM cores will be incorrect. This + # will be fixed at a later date. + if self._prepare_kvm: + m5.ticks.fixGlobalFrequency() + root.sim_quantum = m5.ticks.fromSeconds(0.001) diff --git a/src/python/gem5/prebuilt/demo/arm_demo_board.py b/src/python/gem5/prebuilt/demo/arm_demo_board.py new file mode 100644 index 0000000000..dfbc6d89e2 --- /dev/null +++ b/src/python/gem5/prebuilt/demo/arm_demo_board.py @@ -0,0 +1,112 @@ +# Copyright (c) 2024 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.objects import ( + ArmDefaultRelease, + VExpress_GEM5_Foundation, + VExpress_GEM5_V1, +) +from m5.util import warn + +from ...components.boards.arm_board import ArmBoard +from ...components.cachehierarchies.classic.private_l1_shared_l2_cache_hierarchy import ( + PrivateL1SharedL2CacheHierarchy, +) +from ...components.memory import DualChannelDDR4_2400 +from ...components.processors.cpu_types import CPUTypes +from ...components.processors.simple_processor import SimpleProcessor +from ...isas import ISA +from ...utils.requires import requires + + +class ArmDemoBoard(ArmBoard): + """ + This prebuilt ARM board is used for demonstration purposes. It simulates an + ARM 3GHz dual-core system with a 4GiB DDR4_2400 memory system. It uses + a PrivateL1SharedL2CacheHierarchy with l1d and l1i caches set to 64KiB and + l2 shared cache set to 8MiB + + **DISCLAIMER**: This board is solely for demonstration purposes. This board + is not known to be representative of any real-world system or produce + reliable statistical results. + """ + + def __init__(self, use_kvm: bool = False) -> None: + """ + :param use_kvm: If True, the board will use a SimpleProcessor + with cpu type of CPUTypes.KVM. If False, the board will use a SimpleProcessor with + a cpu type of CPUTypes.TIMING. + """ + requires( + isa_required=ISA.ARM, + ) + + warn( + "The ARMDemoBoard is solely for demonstration purposes. 
" + "This board is not known to be be representative of any " + "real-world system. Use with caution." + ) + cache_hierarchy = PrivateL1SharedL2CacheHierarchy( + l1d_size="64KiB", l1i_size="64KiB", l2_size="8MiB" + ) + + # Note: Normally a system with these specification would have 1 + # GiB for memory but because some benchmarks would not run with + # 1 GiB of memory so we have set it to 4 GiB. + memory = DualChannelDDR4_2400(size="4GiB") + + if use_kvm: + processor = SimpleProcessor( + cpu_type=CPUTypes.KVM, num_cores=2, isa=ISA.ARM + ) + # The ArmBoard requires a `release` to be specified. This adds all the + # extensions or features to the system. We are setting this to for_kvm() + # to enable KVM simulation. + release = ArmDefaultRelease.for_kvm() + + # The platform sets up the memory ranges of all the on-chip and off-chip + # devices present on the ARM system. ARM KVM only works with VExpress_GEM5_V1 + # on the ArmBoard at the moment. + platform = VExpress_GEM5_V1() + + else: + processor = SimpleProcessor( + cpu_type=CPUTypes.TIMING, num_cores=2, isa=ISA.ARM + ) + release = ArmDefaultRelease() + + # The platform sets up the memory ranges of all the on-chip and off-chip + # devices present on the ARM system. + platform = VExpress_GEM5_Foundation() + + super().__init__( + clk_freq="3GHz", + processor=processor, + memory=memory, + cache_hierarchy=cache_hierarchy, + release=release, + platform=platform, + ) diff --git a/src/python/gem5/prebuilt/demo/x86_demo_board.py b/src/python/gem5/prebuilt/demo/x86_demo_board.py index 793b43a3d1..ac89847f2b 100644 --- a/src/python/gem5/prebuilt/demo/x86_demo_board.py +++ b/src/python/gem5/prebuilt/demo/x86_demo_board.py @@ -24,27 +24,33 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+from m5.objects import ( + IOXBar, + Pc, + Port, + X86FsLinux, +) from m5.util import warn -from ...coherence_protocol import CoherenceProtocol +from ...components.boards.se_binary_workload import SEBinaryWorkload from ...components.boards.x86_board import X86Board -from ...components.cachehierarchies.ruby.mesi_two_level_cache_hierarchy import ( - MESITwoLevelCacheHierarchy, +from ...components.cachehierarchies.classic.private_l1_shared_l2_cache_hierarchy import ( + PrivateL1SharedL2CacheHierarchy, ) -from ...components.memory.single_channel import SingleChannelDDR3_1600 +from ...components.memory.multi_channel import DualChannelDDR4_2400 from ...components.processors.cpu_types import CPUTypes from ...components.processors.simple_processor import SimpleProcessor from ...isas import ISA +from ...utils.override import overrides from ...utils.requires import requires -class X86DemoBoard(X86Board): +class X86DemoBoard(X86Board, SEBinaryWorkload): """ This prebuilt X86 board is used for demonstration purposes. It simulates - an X86 3GHz quad-core system with a 2GiB DDR3_1600 memory system. A - MESI_Two_Level cache hierarchy is set with an l1 data and instruction - cache, each 32KiB with an associativity of 8, and a single bank l2 cache of - 1MiB with an associativity of 16. + an X86 3GHz dual-core system with a 3GiB DDR4_2400 memory system. The + cache hierarchy consists of per-core private L1 instruction and data + caches (64KiB each) connected to a shared 8MiB L2 cache. **DISCLAIMER**: This board is solely for demonstration purposes. This board is not known to be representative of any real-world system or produce @@ -68,7 +74,6 @@ class X86DemoBoard(X86Board): def __init__(self): requires( isa_required=ISA.X86, - coherence_protocol_required=CoherenceProtocol.MESI_TWO_LEVEL, ) warn( @@ -77,18 +82,15 @@ class X86DemoBoard(X86Board): "real-world system. Use with caution." 
)
- memory = SingleChannelDDR3_1600(size="2GiB")
+ # The other demo boards have 4 GiB of memory, but X86Board can only
+ # support up to 3 GiB.
+ memory = DualChannelDDR4_2400(size="3GiB")
processor = SimpleProcessor(
- cpu_type=CPUTypes.TIMING, isa=ISA.X86, num_cores=4
+ cpu_type=CPUTypes.TIMING, isa=ISA.X86, num_cores=2
)
- cache_hierarchy = MESITwoLevelCacheHierarchy(
- l1d_size="32KiB",
- l1d_assoc=8,
- l1i_size="32KiB",
- l1i_assoc=8,
- l2_size="1MiB",
- l2_assoc=16,
- num_l2_banks=1,
+
+ cache_hierarchy = PrivateL1SharedL2CacheHierarchy(
+ l1d_size="64KiB", l1i_size="64KiB", l2_size="8MiB"
)
super().__init__(
@@ -97,3 +99,46 @@
memory=memory,
cache_hierarchy=cache_hierarchy,
)
+
+ @overrides(X86Board)
+ def _setup_board(self) -> None:
+ if self._is_fs:
+ self.pc = Pc()
+
+ self.workload = X86FsLinux()
+
+ # North Bridge
+ self.iobus = IOXBar()
+
+ # Set up all of the I/O.
+ self._setup_io_devices()
+
+ self.m5ops_base = 0xFFFF0000
+
+ @overrides(X86Board)
+ def has_io_bus(self) -> bool:
+ return self.is_fullsystem()
+
+ @overrides(X86Board)
+ def get_io_bus(self) -> IOXBar:
+ if self.has_io_bus():
+ return self.iobus
+ else:
+ raise NotImplementedError(
+ "X86DemoBoard does not have an IO bus. "
+ "Use `has_io_bus()` to check this."
+ )
+
+ @overrides(X86Board)
+ def has_coherent_io(self) -> bool:
+ return self.is_fullsystem()
+
+ @overrides(X86Board)
+ def get_mem_side_coherent_io_port(self) -> Port:
+ if self.has_coherent_io():
+ return self.iobus.mem_side_ports
+ else:
+ raise NotImplementedError(
+ "X86DemoBoard does not have any I/O ports. Use has_coherent_io"
+ " to check this." 
+ )
diff --git a/src/python/gem5/prebuilt/riscvmatched/riscvmatched_board.py b/src/python/gem5/prebuilt/riscvmatched/riscvmatched_board.py
index 23a7dcc8cb..ba9588c725 100644
--- a/src/python/gem5/prebuilt/riscvmatched/riscvmatched_board.py
+++ b/src/python/gem5/prebuilt/riscvmatched/riscvmatched_board.py
@@ -313,7 +313,7 @@ class RISCVMatchedBoard(
memory.set_memory_range(self.mem_ranges)
@overrides(AbstractSystemBoard)
- def _pre_instantiate(self):
+ def _pre_instantiate(self, full_system: Optional[bool] = None) -> None:
if self._fs:
if len(self._bootloader) > 0:
self.workload.bootloader_addr = 0x0
@@ -326,7 +326,7 @@
self.workload.kernel_addr = 0x0
self.workload.entry_point = 0x80000000
- self._connect_things()
+ super()._pre_instantiate(full_system=full_system)
def generate_device_tree(self, outdir: str) -> None:
"""Creates the ``dtb`` and ``dts`` files.
diff --git a/src/python/gem5/simulate/simulator.py b/src/python/gem5/simulate/simulator.py
index 49dfac2bdf..ba74361915 100644
--- a/src/python/gem5/simulate/simulator.py
+++ b/src/python/gem5/simulate/simulator.py
@@ -117,6 +117,10 @@ class Simulator:
behavior. If not set, whether or not to run in FS mode will be
determined via the board's ``is_fullsystem()`` function.
+ **Warning: This parameter is deprecated. The board
+ determines if the simulation is full system or not.
+ This parameter will be removed in a future gem5
+ release.**
:param on_exit_event: An optional map to specify what to execute on
each exit event. There are three possibilities here: a generator,
a list of functions, or a single function.
@@ -291,6 +295,15 @@
"""
+ if full_system is not None:
+ warn(
+ "Setting the full_system parameter via the Simulator "
+ "constructor is deprecated and will be removed in future "
+ "releases of gem5. "
+ "The board determines if the simulation is full system or not "
+ "via its `is_fullsystem` method."
+ ) + self.set_max_ticks(max_ticks) if id: @@ -651,45 +664,12 @@ class Simulator: if not self._instantiated: # Before anything else we run the AbstractBoard's - # `_pre_instantiate` function. - self._board._pre_instantiate() - - root = Root( - full_system=( - self._full_system - if self._full_system is not None - else self._board.is_fullsystem() - ), - board=self._board, + # `_pre_instantiate` function. This returns the root object which + # is required for instantiation. + self._root = self._board._pre_instantiate( + full_system=self._full_system ) - # We take a copy of the Root in case it's required elsewhere - # (for example, in `get_stats()`). - self._root = root - - # The following is a bit of a hack. If a simulation is to use a KVM - # core then the `sim_quantum` value must be set. However, in the - # case of using a SwitchableProcessor the KVM cores may be - # switched out and therefore not accessible via `get_cores()`. - # This is the reason for the `isinstance` check. - # - # We cannot set the `sim_quantum` value in every simulation as - # setting it causes the scheduling of exits to be off by the - # `sim_quantum` value (something necessary if we are using KVM - # cores). Ergo we only set the value of KVM cores are present. - # - # There is still a bug here in that if the user is switching to and - # from KVM and non-KVM cores via the SwitchableProcessor then the - # scheduling of exits for the non-KVM cores will be incorrect. This - # will be fixed at a later date. - processor = self._board.processor - if any(core.is_kvm_core() for core in processor.get_cores()) or ( - isinstance(processor, SwitchableProcessor) - and any(core.is_kvm_core() for core in processor._all_cores()) - ): - m5.ticks.fixGlobalFrequency() - root.sim_quantum = m5.ticks.fromSeconds(0.001) - # m5.instantiate() takes a parameter specifying the path to the # checkpoint directory. If the parameter is None, no checkpoint # will be restored. 
diff --git a/src/python/m5/SimObject.py b/src/python/m5/SimObject.py index ce098bea7d..72bf692b6b 100644 --- a/src/python/m5/SimObject.py +++ b/src/python/m5/SimObject.py @@ -1259,7 +1259,9 @@ class SimObject(metaclass=MetaSimObject): if not self._ccObject: # Make sure this object is in the configuration hierarchy if not self._parent and not isRoot(self): - raise RuntimeError("Attempt to instantiate orphan node") + raise RuntimeError( + f"Attempt to instantiate orphan node {self}" + ) # Cycles in the configuration hierarchy are not supported. This # will catch the resulting recursion and stop. self._ccObject = -1 diff --git a/src/sim/signal.hh b/src/sim/signal.hh index 233de07658..e89fbe0b9f 100644 --- a/src/sim/signal.hh +++ b/src/sim/signal.hh @@ -51,12 +51,11 @@ class SignalSinkPort : public Port SignalSourcePort *_source = nullptr; State _state = {}; - OnChangeFunc _onChange; protected: // if bypass_on_change is specified true, it will not call the _onChange // function. Only _state will be updated if needed. - void + virtual void set(const State &new_state, const bool bypass_on_change = false) { if (new_state == _state) @@ -67,6 +66,8 @@ class SignalSinkPort : public Port _onChange(_state); } + OnChangeFunc _onChange; + public: SignalSinkPort(const std::string &_name, PortID _id=InvalidPortID) : Port(_name, _id) diff --git a/src/systemc/ext/core/sc_export.hh b/src/systemc/ext/core/sc_export.hh index c93f01a9a3..913cd75a9d 100644 --- a/src/systemc/ext/core/sc_export.hh +++ b/src/systemc/ext/core/sc_export.hh @@ -70,6 +70,17 @@ class sc_export : public sc_export_base virtual const char *kind() const override { return "sc_export"; } +#pragma GCC diagnostic push +/** + * The following warning is disabled because the bind methods are overloaded + * in the derived class and the base class. In GCC v13+ this + * 'overloaded-virtual' warning is strict enough to trigger here (though the + * code is correct). 
+ * Please check section 9.3 of SystemC 2.3.1 release note for more details. + */ +#if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ >= 13)) +#pragma GCC diagnostic ignored "-Woverloaded-virtual" +#endif void operator () (IF &i) { bind(i); } virtual void bind(IF &i) @@ -80,6 +91,7 @@ class sc_export : public sc_export_base } interface = &i; } +#pragma GCC diagnostic pop operator IF & () { if (!interface) diff --git a/src/systemc/ext/core/sc_port.hh b/src/systemc/ext/core/sc_port.hh index bd57553559..346eb430b1 100644 --- a/src/systemc/ext/core/sc_port.hh +++ b/src/systemc/ext/core/sc_port.hh @@ -114,19 +114,27 @@ class sc_port_base : public sc_object virtual sc_port_policy _portPolicy() const = 0; }; -// The overloaded virtual is intended in SystemC, so we'll disable the warning. -// Please check section 9.3 of SystemC 2.3.1 release note for more details. -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Woverloaded-virtual" template class sc_port_b : public sc_port_base { public: +#pragma GCC diagnostic push +/** + * The following warning is disabled because the bind methods are overloaded + * in the derived class and the base class. In GCC v13+ this + * 'overloaded-virtual' warning is strict enough to trigger here (though the + * code is correct). + * Please check section 9.3 of SystemC 2.3.1 release note for more details. 
+ */ +#if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ >= 13)) +#pragma GCC diagnostic ignored "-Woverloaded-virtual" +#endif void operator () (IF &i) { bind(i); } void operator () (sc_port_b &p) { bind(p); } virtual void bind(IF &i) { sc_port_base::bind(i); } virtual void bind(sc_port_b &p) { sc_port_base::bind(p); } +#pragma GCC diagnostic pop IF * operator -> () @@ -248,7 +256,6 @@ class sc_port_b : public sc_port_base sc_port_b(const sc_port_b &) {} sc_port_b &operator = (const sc_port_b &) { return *this; } }; -#pragma GCC diagnostic pop template class sc_port : public sc_port_b diff --git a/src/systemc/ext/tlm_core/2/sockets/initiator_socket.hh b/src/systemc/ext/tlm_core/2/sockets/initiator_socket.hh index 4f67b59237..d4cf3849e3 100644 --- a/src/systemc/ext/tlm_core/2/sockets/initiator_socket.hh +++ b/src/systemc/ext/tlm_core/2/sockets/initiator_socket.hh @@ -51,10 +51,6 @@ template class tlm_base_target_socket; -// The overloaded virtual is intended in SystemC, so we'll disable the warning. -// Please check section 9.3 of SystemC 2.3.1 release note for more details. -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Woverloaded-virtual" template , typename BW_IF=tlm_bw_transport_if<>, int N=1, sc_core::sc_port_policy POL=sc_core::SC_ONE_OR_MORE_BOUND> @@ -100,6 +96,18 @@ class tlm_base_initiator_socket : // - Binds the port of the target socket to the export of the initiator // socket // + +#pragma GCC diagnostic push +/** + * The following warning is disabled because the bind methods are overloaded + * in the derived class and the base class. In GCC v13+ this + * 'overloaded-virtual' warning is strict enough to trigger here (though the + * code is correct). + * Please check section 9.3 of SystemC 2.3.1 release note for more details. 
+ */ +#if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ >= 13)) +#pragma GCC diagnostic ignored "-Woverloaded-virtual" +#endif virtual void bind(base_target_socket_type &s) { @@ -132,6 +140,7 @@ class tlm_base_initiator_socket : // virtual void bind(bw_interface_type &ifs) { (get_base_export())(ifs); } void operator() (bw_interface_type &s) { bind(s); } +#pragma GCC diagnostic pop // Implementation of tlm_base_socket_if functions virtual sc_core::sc_port_base &get_port_base() { return *this; } @@ -174,7 +183,6 @@ class tlm_base_initiator_socket : protected: export_type m_export; }; -#pragma GCC diagnostic pop // // Convenience socket classes diff --git a/src/systemc/ext/tlm_core/2/sockets/target_socket.hh b/src/systemc/ext/tlm_core/2/sockets/target_socket.hh index 5da81d892e..a3d3026614 100644 --- a/src/systemc/ext/tlm_core/2/sockets/target_socket.hh +++ b/src/systemc/ext/tlm_core/2/sockets/target_socket.hh @@ -98,8 +98,9 @@ class tlm_base_target_socket : * in the derived class and the base class. In GCC v13+ this * 'overloaded-virtual' warning is strict enough to trigger here (though the * code is correct). + * Please check section 9.3 of SystemC 2.3.1 release note for more details. 
*/ -#if defined(__GNUC__) && (__GNUC__ >= 13) +#if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ >= 13)) #pragma GCC diagnostic ignored "-Woverloaded-virtual" #endif virtual void diff --git a/tests/gem5/gpu/test_gpu_pannotia.py b/tests/gem5/gpu/test_gpu_pannotia.py index 0276e79009..9250c3c47b 100644 --- a/tests/gem5/gpu/test_gpu_pannotia.py +++ b/tests/gem5/gpu/test_gpu_pannotia.py @@ -27,11 +27,14 @@ import gzip import os.path import shutil +from pathlib import Path from urllib.request import urlretrieve from testlib import * -resource_path = joinpath(absdirpath(__file__), "..", "gpu-pannotia-resources") +resource_path = joinpath( + absdirpath(__file__), "..", "resources", "gpu-pannotia" +) binary_path = joinpath(resource_path, "pannotia-bins") dataset_path = joinpath(resource_path, "pannotia-datasets") @@ -52,15 +55,14 @@ if not os.path.isdir(resource_path): os.makedirs(dataset_path) for name in binary_links.keys(): + if Path(f"{binary_path}/{name}").exists(): + continue urlretrieve(binary_links[name], f"{binary_path}/{name}") for name in dataset_links.keys(): + if Path(f"{dataset_path}/{name}").exists(): + continue urlretrieve(dataset_links[name], f"{dataset_path}/{name}") - with gzip.open(f"{dataset_path}/USA-road-d.NY.gr.gz", "rb") as f_in: - with open(f"{dataset_path}/USA-road-d.NY.gr", "wb") as f_out: - shutil.copyfileobj(f_in, f_out) - os.remove(f"{dataset_path}/USA-road-d.NY.gr.gz") - if len(os.listdir(binary_path)) < len(binary_links): testlib.log.test_log.warn( "One or more binaries for the Pannotia GPU tests are missing! Try deleting gpu-pannotia-resources and rerunning." 
diff --git a/tests/gem5/kvm_fork_tests/configs/boot_kvm_fork_run.py b/tests/gem5/kvm_fork_tests/configs/boot_kvm_fork_run.py index 8850a27c75..be6e6009e1 100644 --- a/tests/gem5/kvm_fork_tests/configs/boot_kvm_fork_run.py +++ b/tests/gem5/kvm_fork_tests/configs/boot_kvm_fork_run.py @@ -207,15 +207,15 @@ print("Running with ISA: " + processor.get_isa().name) print("Running with protocol: " + get_runtime_coherence_protocol().name) print() -root = Root(full_system=True, system=motherboard) +# Disable the gdb ports. Required for forking. +m5.disableAllListeners() +root = motherboard._pre_instantiate() # TODO: This of annoying. Is there a way to fix this to happen # automatically when running KVM? root.sim_quantum = int(1e9) -# Disable the gdb ports. Required for forking. -m5.disableAllListeners() -motherboard._pre_instantiate() + m5.instantiate() # Simulate the inital boot with the starting KVM cpu diff --git a/tests/gem5/learning_gem5/ref/test b/tests/gem5/learning_gem5/ref/test index 309ac2fa40..1e83a06f97 100644 --- a/tests/gem5/learning_gem5/ref/test +++ b/tests/gem5/learning_gem5/ref/test @@ -1,3 +1,3 @@ Global frequency set at 1000000000 ticks per second Beginning simulation! 
-Exiting @ tick 9981 because Ruby Tester completed +Exiting @ tick 9831 because Ruby Tester completed diff --git a/tests/gem5/replacement_policies/configs/run_replacement_policy.py b/tests/gem5/replacement_policies/configs/run_replacement_policy.py index 8f52a061f6..f7ecdb71de 100644 --- a/tests/gem5/replacement_policies/configs/run_replacement_policy.py +++ b/tests/gem5/replacement_policies/configs/run_replacement_policy.py @@ -83,9 +83,8 @@ motherboard = TestBoard( memory=memory, cache_hierarchy=cache_hierarchy, ) -root = Root(full_system=False, system=motherboard) -motherboard._pre_instantiate() +root = motherboard._pre_instantiate() m5.instantiate() generator.start_traffic() diff --git a/tests/gem5/replacement_policies/run_replacement_policy.py b/tests/gem5/replacement_policies/run_replacement_policy.py index 8f52a061f6..f7ecdb71de 100644 --- a/tests/gem5/replacement_policies/run_replacement_policy.py +++ b/tests/gem5/replacement_policies/run_replacement_policy.py @@ -83,9 +83,8 @@ motherboard = TestBoard( memory=memory, cache_hierarchy=cache_hierarchy, ) -root = Root(full_system=False, system=motherboard) -motherboard._pre_instantiate() +root = motherboard._pre_instantiate() m5.instantiate() generator.start_traffic() diff --git a/tests/gem5/se_mode/rvv_intrinsic_tests/test.py b/tests/gem5/se_mode/rvv_intrinsic_tests/test.py new file mode 100644 index 0000000000..e20018ba60 --- /dev/null +++ b/tests/gem5/se_mode/rvv_intrinsic_tests/test.py @@ -0,0 +1,63 @@ +# Copyright (c) 2024 Barcelona Supercomputing Center +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. 
Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software without +# specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import os +import re +import sys + +from testlib import * + +resources = [ + "rvv-branch", + "rvv-index", + "rvv-matmul", + "rvv-memcpy", + "rvv-reduce", + "rvv-saxpy", + "rvv-sgemm", + "rvv-strcmp", + "rvv-strcpy", + "rvv-strlen", + "rvv-strlen-fault", + "rvv-strncpy", +] + +vlens = [2**x for x in range(7, 15)] + +for resource in resources: + out_verifier = verifier.MatchRegex(re.compile(f"^.*{resource}: pass$")) + + for vlen in vlens: + gem5_verify_config( + name=f"test-riscv-{resource}-vlen_{vlen}-O3-se-mode", + fixtures=(), + verifiers=(out_verifier,), + config=f"{config.base_dir}/configs/example/gem5_library/riscv-rvv-example.py", + config_args=[resource, f"--vlen={vlen}"], + valid_isas=(constants.all_compiled_tag,), + length=constants.quick_tag, + ) diff --git a/tests/gem5/stats/configs/pystat_vector2d_check.py b/tests/gem5/stats/configs/pystat_vector2d_check.py index 617463e56f..909de12232 100644 --- a/tests/gem5/stats/configs/pystat_vector2d_check.py +++ b/tests/gem5/stats/configs/pystat_vector2d_check.py @@ -138,9 +138,11 @@ for x in range(args.num_vectors): vectors[x_index] = { "type": "Vector", - "description": stat_tester.subdescs[x] - if x in stat_tester.subdescs - else stat_tester.description, + "description": ( + stat_tester.subdescs[x] + if x in stat_tester.subdescs + else stat_tester.description + ), "value": vector, } diff --git a/tests/gem5/traffic_gen/configs/simple_traffic_run.py b/tests/gem5/traffic_gen/configs/simple_traffic_run.py index 3a850b497d..7c264cefe9 100644 --- a/tests/gem5/traffic_gen/configs/simple_traffic_run.py +++ b/tests/gem5/traffic_gen/configs/simple_traffic_run.py @@ -202,9 +202,7 @@ motherboard = TestBoard( cache_hierarchy=cache_hierarchy, ) -root = Root(full_system=False, system=motherboard) - -motherboard._pre_instantiate() +root = motherboard._pre_instantiate() m5.instantiate() generator.start_traffic() diff --git a/util/dockerfiles/docker-bake.hcl b/util/dockerfiles/docker-bake.hcl index 05f3b4c94b..3517894684 
100644 --- a/util/dockerfiles/docker-bake.hcl +++ b/util/dockerfiles/docker-bake.hcl @@ -125,7 +125,8 @@ group "gcc-compilers" { "gcc-version-10", "gcc-version-11", "gcc-version-12", - "gcc-version-13" + "gcc-version-13", + "gcc-version-14" ] } @@ -169,6 +170,16 @@ target "gcc-version-13" { tags = ["${IMAGE_URI}/gcc-version-13:${TAG}"] } +target "gcc-version-14" { + inherits = ["common"] + annotations = ["index,manifest:org.opencontainers.image.description=An image with all dependencies for building gem5 with a GCC v14 compiler."] + args = { + version = "14" + } + context = "gcc-compiler" + tags = ["${IMAGE_URI}/gcc-version-14:${TAG}"] +} + group "ubuntu-releases" { targets=[ "ubuntu-24-04_all-dependencies", diff --git a/util/dockerfiles/gcc-compiler/Dockerfile b/util/dockerfiles/gcc-compiler/Dockerfile index f36130ebff..8fd5032113 100644 --- a/util/dockerfiles/gcc-compiler/Dockerfile +++ b/util/dockerfiles/gcc-compiler/Dockerfile @@ -3,7 +3,7 @@ FROM ghcr.io/gem5/ubuntu-24.04_all-dependencies:latest LABEL org.opencontainers.image.source=https://github.com/gem5/gem5 LABEL org.opencontainers.image.licenses=BSD-3-Clause -ARG version=13 # Version of GCC to install in this image. Default is 13. +ARG version=14 # Version of GCC to install in this image. Default is 14. 
RUN apt -y update && \ apt -y install gcc-${version} g++-${version} && \ diff --git a/util/minorview/model.py b/util/minorview/model.py index 91979825c3..d84680fcd3 100644 --- a/util/minorview/model.py +++ b/util/minorview/model.py @@ -374,9 +374,9 @@ class TwoDColours(ColourPattern): for index, value in parsed: try: - array[index % strips][ - index / strips - ] = special_view_decoder(elemClass)(value) + array[index % strips][index / strips] = ( + special_view_decoder(elemClass)(value) + ) except: print( "Element out of range strips: %d," @@ -912,9 +912,9 @@ class BlobModel: blobs = self.unitNameToBlobs.get(unit, []) for blob in blobs: if blob.visualDecoder is not None: - event.visuals[ - blob.picChar - ] = blob.visualDecoder(pairs) + event.visuals[blob.picChar] = ( + blob.visualDecoder(pairs) + ) self.add_unit_event(event) last_time_lines[unit] = rest