Merge branch 'develop' into update-pannotia-tests

This commit is contained in:
Bobby R. Bruce
2024-10-18 13:40:59 -07:00
committed by GitHub
232 changed files with 4652 additions and 883 deletions

View File

@@ -5,7 +5,7 @@ name: CI Tests
on:
pull_request:
types: [opened, edited, synchronize, ready_for_review]
types: [opened, synchronize, ready_for_review]
concurrency:
group: ${{ github.workflow }}-${{ github.ref || github.run_id }}
@@ -21,17 +21,48 @@ jobs:
- uses: actions/setup-python@v5
- uses: pre-commit/action@v3.0.1
get-date:
# We use the date to label caches. A cache is a "hit" if the requested
# binary and date are the same as what is stored in the cache.
# This essentially means the first job to run on a given day for a given
# binary will always be a "miss" and will have to build the binary then
# upload it as that day's binary to upload. While this isn't the most
# efficient way to do this, the alternative was to take a hash of the
# `src` directory contents and use it as a hash. We found there to be bugs
# with the hash function where this task would timeout. This approach is
# simple, works, and still provides some level of caching.
runs-on: ubuntu-latest
outputs:
date: ${{ steps.date.outputs.date }}
steps:
- name: Get the current date
id: date
run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_ENV
unittests-all-opt:
runs-on: [self-hosted, linux, x64]
if: github.event.pull_request.draft == false
container: ghcr.io/gem5/ubuntu-24.04_all-dependencies:latest
needs: [pre-commit] # only runs if pre-commit passes.
needs: [pre-commit, get-date] # only runs if pre-commit passes.
timeout-minutes: 60
steps:
- uses: actions/checkout@v4
# Restore the cache if available. As this just builds the unittests
# we only obtain the cache and do not provide it if it is not
# available.
- name: Cache build/ALL
uses: actions/cache/restore@v4
with:
path: build/ALL
key: testlib-build-all-${{ env.date }}
restore-keys: |
testlib-build-all
- name: CI Unittests
working-directory: ${{ github.workspace }}
run: scons build/ALL/unittests.opt -j $(nproc)
run: scons --no-compress-debug build/ALL/unittests.opt -j $(nproc)
- run: echo "This job's status is ${{ job.status }}."
testlib-quick-matrix:
@@ -83,14 +114,24 @@ jobs:
runs-on: [self-hosted, linux, x64]
if: github.event.pull_request.draft == false
container: ghcr.io/gem5/ubuntu-24.04_all-dependencies:latest
needs: [pre-commit, testlib-quick-matrix]
needs: [pre-commit, testlib-quick-matrix, get-date]
strategy:
matrix:
build-target: ${{ fromJson(needs.testlib-quick-matrix.outputs.build-matrix) }}
steps:
- uses: actions/checkout@v4
- name: Cache build/ALL
uses: actions/cache@v4
if: ${{ endsWith(matrix.build-target, 'build/ALL/gem5.opt') }}
with:
path: build/ALL
key: testlib-build-all-${{ env.date }}
restore-keys: |
testlib-build-all
- name: Build gem5
run: scons ${{ matrix.build-target }} -j $(nproc)
run: scons --no-compress-debug ${{ matrix.build-target }} -j $(nproc)
# Upload the gem5 binary as an artifact.
# Note: the "achor.txt" file is a hack to make sure the paths are
@@ -199,13 +240,23 @@ jobs:
runs-on: [self-hosted, linux, x64]
container: ghcr.io/gem5/gcn-gpu:latest
timeout-minutes: 180
needs: [pre-commit]
needs: [pre-commit, get-date]
steps:
- uses: actions/checkout@v4
# Obtain the cache if available. If not available this will upload
# this job's instance of the cache.
- name: Cache build/VEGA_X86
uses: actions/cache@v4
with:
path: build/VEGA_X86
key: testlib-build-vega-${{ env.date }}
restore-keys: |
testlib-build-vega
# Build the VEGA_X86/gem5.opt binary.
- name: Build VEGA_X86/gem5.opt
run: scons build/VEGA_X86/gem5.opt -j`nproc`
run: scons --no-compress-debug build/VEGA_X86/gem5.opt -j`nproc`
# Run the GPU tests.
- name: Run Testlib GPU Tests

View File

@@ -13,8 +13,8 @@ jobs:
strategy:
fail-fast: false
matrix:
image: [gcc-version-13, gcc-version-12, gcc-version-11, gcc-version-10, clang-version-18, clang-version-17, clang-version-16, clang-version-15,
clang-version-14, ubuntu-22.04_all-dependencies, ubuntu-24.04_all-dependencies, ubuntu-24.04_min-dependencies]
image: [gcc-version-14, gcc-version-13, gcc-version-12, gcc-version-11, gcc-version-10, clang-version-18, clang-version-17, clang-version-16,
clang-version-15, clang-version-14, ubuntu-22.04_all-dependencies, ubuntu-24.04_all-dependencies, ubuntu-24.04_min-dependencies]
opts: [.opt, .fast]
runs-on: [self-hosted, linux, x64]
timeout-minutes: 2880 # 48 hours
@@ -32,7 +32,7 @@ jobs:
matrix:
gem5-compilation: [ARM, ARM_MESI_Three_Level, ARM_MESI_Three_Level_HTM, ARM_MOESI_hammer, Garnet_standalone, MIPS, 'NULL', NULL_MESI_Two_Level,
NULL_MOESI_CMP_directory, NULL_MOESI_CMP_token, NULL_MOESI_hammer, POWER, RISCV, SPARC, X86, X86_MI_example, X86_MOESI_AMD_Base, VEGA_X86]
image: [gcc-version-13, clang-version-18]
image: [gcc-version-14, clang-version-18]
opts: [.opt]
runs-on: [self-hosted, linux, x64]
timeout-minutes: 2880 # 48 hours

View File

@@ -8,6 +8,14 @@ on:
workflow_dispatch:
jobs:
get-date:
runs-on: ubuntu-latest
steps:
- name: Get the current date
id: date
run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_ENV
# this builds both unittests.fast and unittests.debug
unittests-fast-debug:
strategy:
@@ -16,13 +24,14 @@ jobs:
runs-on: [self-hosted, linux, x64]
container: ghcr.io/gem5/ubuntu-24.04_all-dependencies:latest
timeout-minutes: 60
needs: get-date
steps:
- uses: actions/checkout@v4
- name: Cache build/ALL
uses: actions/cache/restore@v4
with:
path: build/ALL
key: testlib-build-all-${{ hashFiles('src/**') }}
key: testlib-build-all-${{ env.date }}
restore-keys: |
testlib-build-all
- name: ALL/unittests.${{ matrix.type }} UnitTests
@@ -38,6 +47,7 @@ jobs:
runs-on: [self-hosted, linux, x64]
container: ghcr.io/gem5/ubuntu-24.04_all-dependencies:latest
timeout-minutes: 1440 # 24 hours for entire matrix to run
needs: get-date
steps:
- name: Clean runner
run: rm -rf ./* || true rm -rf ./.??* || true rm -rf ~/.cache || true
@@ -47,13 +57,13 @@ jobs:
uses: actions/cache@v4
with:
path: build/NULL
key: testlib-build-null-${{ hashFiles('src/**') }}
key: testlib-build-null-${{ env.date }}
- name: Restore build/ALL cache
uses: actions/cache@v4
with:
path: build/ALL
key: testlib-build-all-${{ hashFiles('src/**') }}
key: testlib-build-all-${{ env.date }}
- name: long ${{ matrix.test-type }} tests
working-directory: ${{ github.workspace }}/tests
@@ -81,6 +91,7 @@ jobs:
gem5-library-example-arm-ubuntu-run-test-ALL-x86_64-opt, gem5-library-example-riscvmatched-hello-ALL-x86_64-opt]
container: ghcr.io/gem5/ubuntu-24.04_all-dependencies:latest
timeout-minutes: 1440 # 24 hours
needs: get-date
steps:
- name: Clean runner
run: rm -rf ./* || true rm -rf ./.??* || true rm -rf ~/.cache || true
@@ -90,7 +101,7 @@ jobs:
uses: actions/cache@v4
with:
path: build/ALL
key: testlib-build-all-${{ hashFiles('src/**') }}
key: testlib-build-all-${{ env.date }}
restore-keys: |
testlib-build-all
@@ -113,6 +124,7 @@ jobs:
runs-on: [self-hosted, linux, x64]
container: ghcr.io/gem5/gcn-gpu:latest
timeout-minutes: 720 # 12 hours
needs: get-date
steps:
- uses: actions/checkout@v4
@@ -123,7 +135,7 @@ jobs:
uses: actions/cache@v4
with:
path: build/VEGA_X86
key: testlib-build-vega-${{ hashFiles('src/**') }}
key: testlib-build-vega-${{ env.date }}
restore-keys: |
testlib-build-vega

View File

@@ -9,6 +9,13 @@ on:
jobs:
get-date:
runs-on: ubuntu-latest
steps:
- name: Get the current date
id: date
run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_ENV
# start running the very-long tests
testlib-very-long-tests:
strategy:
@@ -18,6 +25,7 @@ jobs:
runs-on: [self-hosted, linux, x64]
container: ghcr.io/gem5/ubuntu-24.04_all-dependencies:latest
timeout-minutes: 4320 # 3 days
needs: get-date
steps:
- name: Clean runner
run: rm -rf ./* || true rm -rf ./.??* || true rm -rf ~/.cache || true
@@ -27,7 +35,7 @@ jobs:
uses: actions/cache@v4
with:
path: build/ALL
key: testlib-build-all-${{ hashFiles('src/**') }}
key: testlib-build-all-${{ env.date }}
restore-keys: |
testlib-build-all
@@ -49,6 +57,7 @@ jobs:
runs-on: [self-hosted, linux, x64]
container: ghcr.io/gem5/gcn-gpu:latest
timeout-minutes: 4320 # 3 days
needs: get-date
steps:
- uses: actions/checkout@v4
@@ -59,7 +68,7 @@ jobs:
uses: actions/cache@v4
with:
path: build/VEGA_X86
key: testlib-build-vega-${{ hashFiles('src/**') }}
key: testlib-build-vega-${{ env.date }}
restore-keys: |
testlib-build-vega

View File

@@ -49,11 +49,11 @@ exclude: |
tests/.*/ref/.*
)$
default_stages: [commit]
default_stages: [pre-commit]
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.5.0
rev: v5.0.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
@@ -69,7 +69,7 @@ repos:
- id: destroyed-symlinks
- id: requirements-txt-fixer
- repo: https://github.com/PyCQA/isort
rev: 5.11.5
rev: 5.13.2
hooks:
- id: isort
- repo: https://github.com/jumanjihouse/pre-commit-hook-yamlfmt
@@ -77,11 +77,11 @@ repos:
hooks:
- id: yamlfmt
- repo: https://github.com/psf/black
rev: 23.9.1
rev: 24.10.0
hooks:
- id: black
- repo: https://github.com/asottile/pyupgrade
rev: v3.14.0
rev: v3.17.0
hooks:
- id: pyupgrade
# Python 3.8 is the earliest version supported.

7
.vscode/settings.json vendored Normal file
View File

@@ -0,0 +1,7 @@
{
"python.analysis.extraPaths": [
"src/python",
"ext",
"tests"
]
}

View File

@@ -568,9 +568,9 @@ def config_hmc_dev(opt, system, hmc_host):
# Attach 4 serial link to 4 crossbar/s
for i in range(opt.num_serial_links):
if opt.enable_link_monitor:
system.hmc_host.seriallink[
i
].mem_side_port = system.hmc_dev.lmonitor[i].cpu_side_port
system.hmc_host.seriallink[i].mem_side_port = (
system.hmc_dev.lmonitor[i].cpu_side_port
)
system.hmc_dev.lmonitor[i].mem_side_port = system.hmc_dev.xbar[
i
].cpu_side_ports
@@ -613,14 +613,12 @@ def config_hmc_dev(opt, system, hmc_host):
]
# Connect the bridge between crossbars
system.hmc_dev.xbar[
i
].mem_side_ports = system.hmc_dev.buffers[
index
].cpu_side_port
system.hmc_dev.buffers[
index
].mem_side_port = system.hmc_dev.xbar[j].cpu_side_ports
system.hmc_dev.xbar[i].mem_side_ports = (
system.hmc_dev.buffers[index].cpu_side_port
)
system.hmc_dev.buffers[index].mem_side_port = (
system.hmc_dev.xbar[j].cpu_side_ports
)
else:
# Don't connect the xbar to itself
pass
@@ -629,49 +627,49 @@ def config_hmc_dev(opt, system, hmc_host):
# can only direct traffic to it local vaults
if opt.arch == "mixed":
system.hmc_dev.buffer30 = Bridge(ranges=system.mem_ranges[0:4])
system.hmc_dev.xbar[
3
].mem_side_ports = system.hmc_dev.buffer30.cpu_side_port
system.hmc_dev.xbar[3].mem_side_ports = (
system.hmc_dev.buffer30.cpu_side_port
)
system.hmc_dev.buffer30.mem_side_port = system.hmc_dev.xbar[
0
].cpu_side_ports
system.hmc_dev.buffer31 = Bridge(ranges=system.mem_ranges[4:8])
system.hmc_dev.xbar[
3
].mem_side_ports = system.hmc_dev.buffer31.cpu_side_port
system.hmc_dev.xbar[3].mem_side_ports = (
system.hmc_dev.buffer31.cpu_side_port
)
system.hmc_dev.buffer31.mem_side_port = system.hmc_dev.xbar[
1
].cpu_side_ports
system.hmc_dev.buffer32 = Bridge(ranges=system.mem_ranges[8:12])
system.hmc_dev.xbar[
3
].mem_side_ports = system.hmc_dev.buffer32.cpu_side_port
system.hmc_dev.xbar[3].mem_side_ports = (
system.hmc_dev.buffer32.cpu_side_port
)
system.hmc_dev.buffer32.mem_side_port = system.hmc_dev.xbar[
2
].cpu_side_ports
system.hmc_dev.buffer20 = Bridge(ranges=system.mem_ranges[0:4])
system.hmc_dev.xbar[
2
].mem_side_ports = system.hmc_dev.buffer20.cpu_side_port
system.hmc_dev.xbar[2].mem_side_ports = (
system.hmc_dev.buffer20.cpu_side_port
)
system.hmc_dev.buffer20.mem_side_port = system.hmc_dev.xbar[
0
].cpu_side_ports
system.hmc_dev.buffer21 = Bridge(ranges=system.mem_ranges[4:8])
system.hmc_dev.xbar[
2
].mem_side_ports = system.hmc_dev.buffer21.cpu_side_port
system.hmc_dev.xbar[2].mem_side_ports = (
system.hmc_dev.buffer21.cpu_side_port
)
system.hmc_dev.buffer21.mem_side_port = system.hmc_dev.xbar[
1
].cpu_side_ports
system.hmc_dev.buffer23 = Bridge(ranges=system.mem_ranges[12:16])
system.hmc_dev.xbar[
2
].mem_side_ports = system.hmc_dev.buffer23.cpu_side_port
system.hmc_dev.xbar[2].mem_side_ports = (
system.hmc_dev.buffer23.cpu_side_port
)
system.hmc_dev.buffer23.mem_side_port = system.hmc_dev.xbar[
3
].cpu_side_ports

View File

@@ -541,9 +541,9 @@ def run(options, root, testsys, cpu_class):
IndirectBPClass = ObjectList.indirect_bp_list.get(
options.indirect_bp_type
)
switch_cpus[
i
].branchPred.indirectBranchPred = IndirectBPClass()
switch_cpus[i].branchPred.indirectBranchPred = (
IndirectBPClass()
)
switch_cpus[i].createThreads()
# If elastic tracing is enabled attach the elastic trace probe

View File

@@ -1683,6 +1683,15 @@ class HPI_MMU(ArmMMU):
class HPI_BTB(SimpleBTB):
numEntries = 128
tagBits = 18
associativity = 1
instShiftAmt = 2
btbReplPolicy = LRURP()
btbIndexingPolicy = BTBSetAssociative(
num_entries=Parent.numEntries,
set_shift=Parent.instShiftAmt,
assoc=Parent.associativity,
tag_bits=Parent.tagBits,
)
class HPI_BP(TournamentBP):

View File

@@ -111,6 +111,15 @@ class O3_ARM_v7a_FUP(FUPool):
class O3_ARM_v7a_BTB(SimpleBTB):
numEntries = 2048
tagBits = 18
associativity = 1
instShiftAmt = 2
btbReplPolicy = LRURP()
btbIndexingPolicy = BTBSetAssociative(
num_entries=Parent.numEntries,
set_shift=Parent.instShiftAmt,
assoc=Parent.associativity,
tag_bits=Parent.tagBits,
)
# Bi-Mode Branch Predictor

View File

@@ -108,6 +108,15 @@ class ex5_big_FUP(FUPool):
class ex5_big_BTB(SimpleBTB):
numEntries = 4096
tagBits = 18
associativity = 1
instShiftAmt = 2
btbReplPolicy = LRURP()
btbIndexingPolicy = BTBSetAssociative(
num_entries=Parent.numEntries,
set_shift=Parent.instShiftAmt,
assoc=Parent.associativity,
tag_bits=Parent.tagBits,
)
# Bi-Mode Branch Predictor

View File

@@ -213,9 +213,9 @@ def build_test_system(np, isa: ISA):
IndirectBPClass = ObjectList.indirect_bp_list.get(
args.indirect_bp_type
)
test_sys.cpu[
i
].branchPred.indirectBranchPred = IndirectBPClass()
test_sys.cpu[i].branchPred.indirectBranchPred = (
IndirectBPClass()
)
test_sys.cpu[i].createThreads()
# If elastic tracing is enabled when not restoring from checkpoint and

View File

@@ -935,9 +935,9 @@ gpu_port_idx = gpu_port_idx - args.num_cp * 2
token_port_idx = 0
for i in range(len(system.ruby._cpu_ports)):
if isinstance(system.ruby._cpu_ports[i], VIPERCoalescer):
system.cpu[shader_idx].CUs[
token_port_idx
].gmTokenPort = system.ruby._cpu_ports[i].gmTokenPort
system.cpu[shader_idx].CUs[token_port_idx].gmTokenPort = (
system.ruby._cpu_ports[i].gmTokenPort
)
token_port_idx += 1
wavefront_size = args.wf_size

View File

@@ -0,0 +1,92 @@
# Copyright (c) 2024 The Regents of the University of California
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met: redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer;
# redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution;
# neither the name of the copyright holders nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
This script further shows an example of booting an ARM based full system Ubuntu
disk image. This simulation boots the disk image using the ArmDemoBoard.
Usage
-----
```bash
scons build/ARM/gem5.opt -j $(nproc)
./build/ARM/gem5.opt configs/example/gem5_library/arm-demo-ubuntu-run.py
```
"""
import argparse
from gem5.isas import ISA
from gem5.prebuilt.demo.arm_demo_board import ArmDemoBoard
from gem5.resources.resource import obtain_resource
from gem5.simulate.exit_event import ExitEvent
from gem5.simulate.simulator import Simulator
from gem5.utils.requires import requires
# This runs a check to ensure the gem5 binary interpreting this file is compiled to include the ARM ISA.
requires(isa_required=ISA.ARM)
parser = argparse.ArgumentParser(
description="An example configuration script to run the ArmDemoBoard."
)
parser.add_argument(
"--use-kvm",
action="store_true",
help="Use KVM cores instead of Timing.",
)
args = parser.parse_args()
board = ArmDemoBoard(use_kvm=args.use_kvm)
board.set_workload(
obtain_resource(
"arm-ubuntu-24.04-boot-with-systemd", resource_version="2.0.0"
)
)
def exit_event_handler():
print("First exit: kernel booted")
yield False # gem5 is now executing systemd startup
print("Second exit: Started `after_boot.sh` script")
# The after_boot.sh script is executed after the kernel and systemd have
# booted.
yield False # gem5 is now executing the `after_boot.sh` script
print("Third exit: Finished `after_boot.sh` script")
# The after_boot.sh script will run a script if it is passed via
# m5 readfile. This is the last exit event before the simulation exits.
yield True
# We define the system with the aforementioned system defined.
simulator = Simulator(
board=board,
on_exit_event={
ExitEvent.EXIT: exit_event_handler(),
},
)
simulator.run()

View File

@@ -0,0 +1,120 @@
# Copyright (c) 2024 Barcelona Supercomputing Center
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software without
# specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
This script demonstrates how to run RISC-V vector-enabled binaries in SE mode
with gem5. It accepts the number of CORES, VLEN, and ELEN as optional
parameters, as well as the resource name to run. If no resource name is
provided, a list of available resources will be displayed. If one is given the
simulation will then execute the specified resource binary with the selected
parameters until completion.
Usage
-----
# Compile gem5 for RISC-V
scons build/RISCV/gem5.opt
# Run the simulation
./build/RISCV/gem5.opt configs/example/gem5_library/riscv-rvv-example.py \
[-c CORES] [-v VLEN] [-e ELEN] <resource>
"""
import argparse
from m5.objects import RiscvO3CPU
from gem5.components.boards.simple_board import SimpleBoard
from gem5.components.cachehierarchies.classic.private_l1_private_l2_cache_hierarchy import (
PrivateL1PrivateL2CacheHierarchy,
)
from gem5.components.memory import SingleChannelDDR3_1600
from gem5.components.processors.base_cpu_core import BaseCPUCore
from gem5.components.processors.base_cpu_processor import BaseCPUProcessor
from gem5.isas import ISA
from gem5.resources.resource import obtain_resource
from gem5.simulate.simulator import Simulator
from gem5.utils.requires import requires
class RVVCore(BaseCPUCore):
def __init__(self, elen, vlen, cpu_id):
super().__init__(core=RiscvO3CPU(cpu_id=cpu_id), isa=ISA.RISCV)
self.core.isa[0].elen = elen
self.core.isa[0].vlen = vlen
requires(isa_required=ISA.RISCV)
resources = [
"rvv-branch",
"rvv-index",
"rvv-matmul",
"rvv-memcpy",
"rvv-reduce",
"rvv-saxpy",
"rvv-sgemm",
"rvv-strcmp",
"rvv-strcpy",
"rvv-strlen",
"rvv-strlen-fault",
"rvv-strncpy",
]
parser = argparse.ArgumentParser()
parser.add_argument("resource", type=str, choices=resources)
parser.add_argument("-c", "--cores", required=False, type=int, default=1)
parser.add_argument("-v", "--vlen", required=False, type=int, default=256)
parser.add_argument("-e", "--elen", required=False, type=int, default=64)
args = parser.parse_args()
cache_hierarchy = PrivateL1PrivateL2CacheHierarchy(
l1d_size="32KiB", l1i_size="32KiB", l2_size="512KiB"
)
memory = SingleChannelDDR3_1600()
processor = BaseCPUProcessor(
cores=[RVVCore(args.elen, args.vlen, i) for i in range(args.cores)]
)
board = SimpleBoard(
clk_freq="1GHz",
processor=processor,
memory=memory,
cache_hierarchy=cache_hierarchy,
)
binary = obtain_resource(args.resource)
board.set_se_binary_workload(binary)
simulator = Simulator(board=board, full_system=False)
print("Beginning simulation!")
simulator.run()

View File

@@ -1,4 +1,4 @@
# Copyright (c) 2021 Advanced Micro Devices, Inc.
# Copyright (c) 2021-2024 Advanced Micro Devices, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
@@ -82,10 +82,6 @@ def addRunFSOptions(parser):
help="The second disk image to mount (/dev/sdb)",
)
parser.add_argument("--kernel", default=None, help="Linux kernel to boot")
parser.add_argument("--gpu-rom", default=None, help="GPU BIOS to load")
parser.add_argument(
"--gpu-mmio-trace", default=None, help="GPU MMIO trace to load"
)
parser.add_argument(
"--checkpoint-before-mmios",
default=False,
@@ -241,16 +237,6 @@ def runGpuFSSystem(args):
math.ceil(float(n_cu) / args.cu_per_scalar_cache)
)
# Verify MMIO trace is valid. This is only needed for Vega10 simulations.
# The md5sum refers to the md5sum of the Vega10 MMIO hardware trace in
# the gem5-resources repository. By checking it here, we avoid potential
# errors that would cause the driver not to load and simulations to fail.
if args.gpu_device == "Vega10":
mmio_file = open(args.gpu_mmio_trace, "rb")
mmio_md5 = hashlib.md5(mmio_file.read()).hexdigest()
if mmio_md5 != "c4ff3326ae8a036e329b8b595c83bd6d":
m5.util.panic("MMIO file does not match gem5 resources")
system = makeGpuFSSystem(args)
root = Root(

View File

@@ -176,8 +176,6 @@ def createGPU(system, args):
def connectGPU(system, args):
system.pc.south_bridge.gpu = AMDGPUDevice(pci_func=0, pci_dev=8, pci_bus=0)
system.pc.south_bridge.gpu.trace_file = args.gpu_mmio_trace
system.pc.south_bridge.gpu.rom_binary = args.gpu_rom
system.pc.south_bridge.gpu.checkpoint_before_mmios = (
args.checkpoint_before_mmios
)

View File

@@ -336,9 +336,9 @@ def makeGpuFSSystem(args):
token_port_idx = 0
for i in range(len(system.ruby._cpu_ports)):
if isinstance(system.ruby._cpu_ports[i], VIPERCoalescer):
system.cpu[shader_idx].CUs[
token_port_idx
].gmTokenPort = system.ruby._cpu_ports[i].gmTokenPort
system.cpu[shader_idx].CUs[token_port_idx].gmTokenPort = (
system.ruby._cpu_ports[i].gmTokenPort
)
token_port_idx += 1
wavefront_size = args.wf_size
@@ -346,9 +346,9 @@ def makeGpuFSSystem(args):
# The pipeline issues wavefront_size number of uncoalesced requests
# in one GPU issue cycle. Hence wavefront_size mem ports.
for j in range(wavefront_size):
system.cpu[shader_idx].CUs[i].memory_port[
j
] = system.ruby._cpu_ports[gpu_port_idx].in_ports[j]
system.cpu[shader_idx].CUs[i].memory_port[j] = (
system.ruby._cpu_ports[gpu_port_idx].in_ports[j]
)
gpu_port_idx += 1
for i in range(args.num_compute_units):

View File

@@ -110,8 +110,7 @@ board.set_kernel_disk_workload(
# Begin running of the simulation.
print("Running with ISA: " + processor.get_isa().name)
print()
root = Root(full_system=True, system=board)
board._pre_instantiate()
root = board._pre_instantiate()
m5.instantiate()
print("Beginning simulation!")

View File

@@ -250,9 +250,11 @@ class ConfigManager:
obj,
param_name,
[
(
self.objects_by_name[name]
if name != "Null"
else m5.params.NULL
)
for name in param_values
],
)

View File

@@ -371,6 +371,7 @@ for dma_idx in range(n_DMAs):
num_lanes=1,
clk_domain=thread_clock,
deadlock_threshold=tester_deadlock_threshold,
cache_line_size=system.cache_line_size,
)
)
g_thread_idx += 1
@@ -393,6 +394,7 @@ for cu_idx in range(n_CUs):
num_lanes=args.wf_size,
clk_domain=thread_clock,
deadlock_threshold=tester_deadlock_threshold,
cache_line_size=system.cache_line_size,
)
)
g_thread_idx += 1

View File

@@ -84,6 +84,7 @@ class MyCacheSystem(RubySystem):
# I/D cache is combined and grab from ctrl
dcache=self.controllers[i].cacheMemory,
clk_domain=self.controllers[i].clk_domain,
ruby_system=self,
)
for i in range(len(cpus))
]
@@ -191,7 +192,9 @@ class DirController(Directory_Controller):
self.version = self.versionCount()
self.addr_ranges = ranges
self.ruby_system = ruby_system
self.directory = RubyDirectoryMemory()
self.directory = RubyDirectoryMemory(
block_size=ruby_system.block_size_bytes
)
# Connect this directory to the memory side.
self.memory = mem_ctrls[0].port
self.connectQueues(ruby_system)

View File

@@ -84,6 +84,7 @@ class MyCacheSystem(RubySystem):
# I/D cache is combined and grab from ctrl
dcache=self.controllers[i].cacheMemory,
clk_domain=self.controllers[i].clk_domain,
ruby_system=self,
)
for i in range(len(cpus))
]
@@ -180,7 +181,9 @@ class DirController(Directory_Controller):
self.version = self.versionCount()
self.addr_ranges = ranges
self.ruby_system = ruby_system
self.directory = RubyDirectoryMemory()
self.directory = RubyDirectoryMemory(
block_size=ruby_system.block_size_bytes
)
# Connect this directory to the memory side.
self.memory = mem_ctrls[0].port
self.connectQueues(ruby_system)

View File

@@ -79,6 +79,7 @@ class TestCacheSystem(RubySystem):
# I/D cache is combined and grab from ctrl
dcache=self.controllers[i].cacheMemory,
clk_domain=self.clk_domain,
ruby_system=self,
)
for i in range(num_testers)
]

View File

@@ -84,14 +84,14 @@ class CPCntrl(AMD_Base_Controller, CntrlBase):
self.L2cache = L2Cache()
self.L2cache.create(options.l2_size, options.l2_assoc, options)
self.sequencer = RubySequencer()
self.sequencer = RubySequencer(ruby_system=ruby_system)
self.sequencer.version = self.seqCount()
self.sequencer.dcache = self.L1D0cache
self.sequencer.ruby_system = ruby_system
self.sequencer.coreid = 0
self.sequencer.is_cpu_sequencer = True
self.sequencer1 = RubySequencer()
self.sequencer1 = RubySequencer(ruby_system=ruby_system)
self.sequencer1.version = self.seqCount()
self.sequencer1.dcache = self.L1D1cache
self.sequencer1.ruby_system = ruby_system

View File

@@ -114,14 +114,14 @@ class CPCntrl(CorePair_Controller, CntrlBase):
self.L2cache = L2Cache()
self.L2cache.create(options.l2_size, options.l2_assoc, options)
self.sequencer = RubySequencer()
self.sequencer = RubySequencer(ruby_system=ruby_system)
self.sequencer.version = self.seqCount()
self.sequencer.dcache = self.L1D0cache
self.sequencer.ruby_system = ruby_system
self.sequencer.coreid = 0
self.sequencer.is_cpu_sequencer = True
self.sequencer1 = RubySequencer()
self.sequencer1 = RubySequencer(ruby_system=ruby_system)
self.sequencer1.version = self.seqCount()
self.sequencer1.dcache = self.L1D1cache
self.sequencer1.ruby_system = ruby_system
@@ -169,7 +169,7 @@ class TCPCntrl(TCP_Controller, CntrlBase):
# TCP_Controller inherits this from RubyController
self.mandatory_queue_latency = options.mandatory_queue_latency
self.coalescer = VIPERCoalescer()
self.coalescer = VIPERCoalescer(ruby_system=ruby_system)
self.coalescer.version = self.seqCount()
self.coalescer.icache = self.L1cache
self.coalescer.dcache = self.L1cache
@@ -182,7 +182,7 @@ class TCPCntrl(TCP_Controller, CntrlBase):
options.max_coalesces_per_cycle
)
self.sequencer = RubySequencer()
self.sequencer = RubySequencer(ruby_system=ruby_system)
self.sequencer.version = self.seqCount()
self.sequencer.dcache = self.L1cache
self.sequencer.ruby_system = ruby_system
@@ -211,7 +211,7 @@ class TCPCntrl(TCP_Controller, CntrlBase):
self.L1cache.create(options)
self.issue_latency = 1
self.coalescer = VIPERCoalescer()
self.coalescer = VIPERCoalescer(ruby_system=ruby_system)
self.coalescer.version = self.seqCount()
self.coalescer.icache = self.L1cache
self.coalescer.dcache = self.L1cache
@@ -219,7 +219,7 @@ class TCPCntrl(TCP_Controller, CntrlBase):
self.coalescer.support_inst_reqs = False
self.coalescer.is_cpu_sequencer = False
self.sequencer = RubySequencer()
self.sequencer = RubySequencer(ruby_system=ruby_system)
self.sequencer.version = self.seqCount()
self.sequencer.dcache = self.L1cache
self.sequencer.ruby_system = ruby_system
@@ -387,7 +387,9 @@ class DirCntrl(Directory_Controller, CntrlBase):
self.response_latency = 30
self.addr_ranges = dir_ranges
self.directory = RubyDirectoryMemory()
self.directory = RubyDirectoryMemory(
block_size=ruby_system.block_size_bytes
)
self.L3CacheMemory = L3Cache()
self.L3CacheMemory.create(options, ruby_system, system)
@@ -686,7 +688,7 @@ def construct_gpudirs(options, system, ruby_system, network):
dir_cntrl.addr_ranges = dram_intf.range
# Append
exec("system.ruby.gpu_dir_cntrl%d = dir_cntrl" % i)
exec("ruby_system.gpu_dir_cntrl%d = dir_cntrl" % i)
dir_cntrl_nodes.append(dir_cntrl)
mem_ctrls.append(mem_ctrl)

View File

@@ -148,6 +148,7 @@ def create_system(
train_misses=5,
num_startup_pfs=4,
cross_page=True,
block_size=options.cacheline_size,
)
l0_cntrl = L0Cache_Controller(

View File

@@ -148,6 +148,7 @@ def create_system(
train_misses=5,
num_startup_pfs=4,
cross_page=True,
block_size=options.cacheline_size,
)
l0_cntrl = L0Cache_Controller(

View File

@@ -94,7 +94,7 @@ def create_system(
is_icache=False,
)
prefetcher = RubyPrefetcher()
prefetcher = RubyPrefetcher(block_size=options.cacheline_size)
clk_domain = cpus[i].clk_domain

View File

@@ -112,14 +112,14 @@ class CPCntrl(CorePair_Controller, CntrlBase):
self.L2cache = L2Cache()
self.L2cache.create(options)
self.sequencer = RubySequencer()
self.sequencer = RubySequencer(ruby_system=ruby_system)
self.sequencer.version = self.seqCount()
self.sequencer.dcache = self.L1D0cache
self.sequencer.ruby_system = ruby_system
self.sequencer.coreid = 0
self.sequencer.is_cpu_sequencer = True
self.sequencer1 = RubySequencer()
self.sequencer1 = RubySequencer(ruby_system=ruby_system)
self.sequencer1.version = self.seqCount()
self.sequencer1.dcache = self.L1D1cache
self.sequencer1.ruby_system = ruby_system
@@ -194,7 +194,9 @@ class DirCntrl(Directory_Controller, CntrlBase):
self.response_latency = 30
self.addr_ranges = dir_ranges
self.directory = RubyDirectoryMemory()
self.directory = RubyDirectoryMemory(
block_size=ruby_system.block_size_bytes
)
self.L3CacheMemory = L3Cache()
self.L3CacheMemory.create(options, ruby_system, system)

View File

@@ -308,7 +308,9 @@ def create_directories(options, bootmem, ruby_system, system):
for i in range(options.num_dirs):
dir_cntrl = Directory_Controller()
dir_cntrl.version = i
dir_cntrl.directory = RubyDirectoryMemory()
dir_cntrl.directory = RubyDirectoryMemory(
block_size=ruby_system.block_size_bytes
)
dir_cntrl.ruby_system = ruby_system
exec("ruby_system.dir_cntrl%d = dir_cntrl" % i)
@@ -316,7 +318,9 @@ def create_directories(options, bootmem, ruby_system, system):
if bootmem is not None:
rom_dir_cntrl = Directory_Controller()
rom_dir_cntrl.directory = RubyDirectoryMemory()
rom_dir_cntrl.directory = RubyDirectoryMemory(
block_size=ruby_system.block_size_bytes
)
rom_dir_cntrl.ruby_system = ruby_system
rom_dir_cntrl.version = i + 1
rom_dir_cntrl.memory = bootmem.port

View File

@@ -960,11 +960,14 @@ class PackedReg
uint64_t elem_mask = (1ULL << ELEM_SIZE) - 1;
value &= elem_mask;
// Clear the bits where the value goes so that operator| can be used.
elem_mask <<= qw_lbit;
qword &= elem_mask;
qword &= ~elem_mask;
value <<= qw_lbit;
qword |= value;
// Promote to 64-bit to prevent shifting out of range
uint64_t value64 = value;
value64 <<= qw_lbit;
qword |= value64;
dwords[udw] = uint32_t(qword >> 32);
dwords[ldw] = uint32_t(qword & mask(32));

View File

@@ -53,8 +53,6 @@ namespace gem5
namespace ArmISA
{
GenericISA::BasicDecodeCache<Decoder, ExtMachInst> Decoder::defaultCache;
Decoder::Decoder(const ArmDecoderParams &params)
: InstDecoder(params, &data),
dvmEnabled(params.dvm_enabled),

View File

@@ -94,7 +94,7 @@ class Decoder : public InstDecoder
enums::DecoderFlavor decoderFlavor;
/// A cache of decoded instruction objects.
static GenericISA::BasicDecodeCache<Decoder, ExtMachInst> defaultCache;
GenericISA::BasicDecodeCache<Decoder, ExtMachInst> defaultCache;
friend class GenericISA::BasicDecodeCache<Decoder, ExtMachInst>;
/**

View File

@@ -264,7 +264,7 @@ class ArmFaultVals : public ArmFault
static FaultVals vals;
public:
ArmFaultVals<T>(ExtMachInst mach_inst = 0, uint32_t _iss = 0) :
ArmFaultVals(ExtMachInst mach_inst = 0, uint32_t _iss = 0) :
ArmFault(mach_inst, _iss) {}
FaultName name() const override { return vals.name; }
FaultOffset offset(ThreadContext *tc) override;

View File

@@ -1,6 +1,6 @@
// -*- mode:c++ -*-
// Copyright (c) 2010-2011, 2016-2019 ARM Limited
// Copyright (c) 2010-2011, 2016-2019, 2024 ARM Limited
// All rights reserved
//
// The license below extends only to copyright in the software and shall
@@ -1891,6 +1891,150 @@ let {{
return new NVrsqrteD<uint32_t>(machInst, vd, vm);
}
}
} else if ((b & 0x1c) == 0x00) {
if (bits(b, 1)) {
switch(size) {
case 1:
if (q) {
return new NVcvt2uhAQ<uint16_t>(machInst, vd, vm);
} else {
return new NVcvt2uhAD<uint16_t>(machInst, vd, vm);
}
case 2:
if (q) {
return new NVcvt2usAQ<uint32_t>(machInst, vd, vm);
} else {
return new NVcvt2usAD<uint32_t>(machInst, vd, vm);
}
default:
return new Unknown(machInst);
}
} else {
switch (size) {
case 0b01:
if (q) {
return new NVcvt2shAQ<int16_t>(machInst, vd, vm);
} else {
return new NVcvt2shAD<int16_t>(machInst, vd, vm);
}
case 0b10:
if (q) {
return new NVcvt2ssAQ<int32_t>(machInst, vd, vm);
} else {
return new NVcvt2ssAD<int32_t>(machInst, vd, vm);
}
default:
return new Unknown(machInst);
}
}
} else if ((b & 0x1c) == 0x04) {
if (bits(b, 1)) {
switch (size) {
case 0b01:
if (q) {
return new NVcvt2uhNQ<uint16_t>(machInst, vd, vm);
} else {
return new NVcvt2uhND<uint16_t>(machInst, vd, vm);
}
case 0b10:
if (q) {
return new NVcvt2usNQ<uint32_t>(machInst, vd, vm);
} else {
return new NVcvt2usND<uint32_t>(machInst, vd, vm);
}
default:
return new Unknown(machInst);
}
} else {
switch (size) {
case 0b01:
if (q) {
return new NVcvt2shNQ<int16_t>(machInst, vd, vm);
} else {
return new NVcvt2shND<int16_t>(machInst, vd, vm);
}
case 0b10:
if (q) {
return new NVcvt2ssNQ<int32_t>(machInst, vd, vm);
} else {
return new NVcvt2ssND<int32_t>(machInst, vd, vm);
}
default:
return new Unknown(machInst);
}
}
} else if ((b & 0x1c) == 0x08) {
if (bits(b, 1)) {
switch (size) {
case 0b01:
if (q) {
return new NVcvt2uhPQ<uint16_t>(machInst, vd, vm);
} else {
return new NVcvt2uhPD<uint16_t>(machInst, vd, vm);
}
case 0b10:
if (q) {
return new NVcvt2usPQ<uint32_t>(machInst, vd, vm);
} else {
return new NVcvt2usPD<uint32_t>(machInst, vd, vm);
}
default:
return new Unknown(machInst);
}
} else {
switch (size) {
case 0b01:
if (q) {
return new NVcvt2shPQ<int16_t>(machInst, vd, vm);
} else {
return new NVcvt2shPD<int16_t>(machInst, vd, vm);
}
case 0b10:
if (q) {
return new NVcvt2ssPQ<int32_t>(machInst, vd, vm);
} else {
return new NVcvt2ssPD<int32_t>(machInst, vd, vm);
}
default:
return new Unknown(machInst);
}
}
} else if ((b & 0x1c) == 0x0c) {
if (bits(b, 1)) {
switch (size) {
case 0b01:
if (q) {
return new NVcvt2uhMQ<uint16_t>(machInst, vd, vm);
} else {
return new NVcvt2uhMD<uint16_t>(machInst, vd, vm);
}
case 0b10:
if (q) {
return new NVcvt2usMQ<uint32_t>(machInst, vd, vm);
} else {
return new NVcvt2usMD<uint32_t>(machInst, vd, vm);
}
default:
return new Unknown(machInst);
}
} else {
switch (size) {
case 0b01:
if (q) {
return new NVcvt2shMQ<int16_t>(machInst, vd, vm);
} else {
return new NVcvt2shMD<int16_t>(machInst, vd, vm);
}
case 0b10:
if (q) {
return new NVcvt2ssMQ<int32_t>(machInst, vd, vm);
} else {
return new NVcvt2ssMD<int32_t>(machInst, vd, vm);
}
default:
return new Unknown(machInst);
}
}
} else {
return new Unknown(machInst);
}

View File

@@ -1,6 +1,6 @@
// -*- mode:c++ -*-
// Copyright (c) 2010-2011, 2015, 2019 ARM Limited
// Copyright (c) 2010-2011, 2015, 2019, 2024 ARM Limited
// All rights reserved
//
// The license below extends only to copyright in the software and shall
@@ -3579,6 +3579,128 @@ let {{
'''
twoRegLongMiscInst("vcvt", "NVcvth2s", "SimdCvtOp", ("uint16_t",), vcvth2sCode)
vcvthp2hCode = '''
FPSCR fpscr = (FPSCR) FpscrExc;
VfpSavedState state = prepFpState(fpscr.rMode);
__asm__ __volatile__("" : "=m" (srcElem1) : "m" (srcElem1));
float mid = vcvtFpHFpS(fpscr, fpscr.dn, fpscr.ahp, srcElem1);
if (flushToZero(mid))
fpscr.idc = 1;
destElem = vfpFpToFixed<float>(mid, %s, 16, 0, true, %s);
__asm__ __volatile__("" :: "m" (destElem));
finishVfp(fpscr, state, true);
FpscrExc = fpscr;
'''
vcvtahp2uhCode = vcvthp2hCode % ("false", "VfpRoundAway")
twoRegMiscInst("vcvta.u16.f16", "NVcvt2uhAD", "SimdCvtOp",
("uint16_t",), 2, vcvtahp2uhCode)
twoRegMiscInst("vcvta.u16.f16", "NVcvt2uhAQ", "SimdCvtOp",
("uint16_t",), 4, vcvtahp2uhCode)
vcvtnhp2uhCode = vcvthp2hCode % ("false", "VfpRoundNearest")
twoRegMiscInst("vcvtn.u16.f16", "NVcvt2uhND", "SimdCvtOp",
("uint16_t",), 2, vcvtnhp2uhCode)
twoRegMiscInst("vcvtn.u16.f16", "NVcvt2uhNQ", "SimdCvtOp",
("uint16_t",), 4, vcvtnhp2uhCode)
vcvtphp2uhCode = vcvthp2hCode % ("false", "VfpRoundUpward")
twoRegMiscInst("vcvtp.u16.f16", "NVcvt2uhPD", "SimdCvtOp",
("uint16_t",), 2, vcvtphp2uhCode)
twoRegMiscInst("vcvtp.u16.f16", "NVcvt2uhPQ", "SimdCvtOp",
("uint16_t",), 4, vcvtphp2uhCode)
vcvtmhp2uhCode = vcvthp2hCode % ("false", "VfpRoundDown")
twoRegMiscInst("vcvtm.u16.f16", "NVcvt2uhMD", "SimdCvtOp",
("uint16_t",), 2, vcvtmhp2uhCode)
twoRegMiscInst("vcvtm.u16.f16", "NVcvt2uhMQ", "SimdCvtOp",
("uint16_t",), 4, vcvtmhp2uhCode)
vcvtahp2shCode = vcvthp2hCode % ("true", "VfpRoundAway")
twoRegMiscInst("vcvta.s16.f16", "NVcvt2shAD", "SimdCvtOp",
("int16_t",), 2, vcvtahp2shCode)
twoRegMiscInst("vcvta.s16.f16", "NVcvt2shAQ", "SimdCvtOp",
("int16_t",), 4, vcvtahp2shCode)
vcvtnhp2shCode = vcvthp2hCode % ("true", "VfpRoundNearest")
twoRegMiscInst("vcvtn.s16.f16", "NVcvt2shND", "SimdCvtOp",
("int16_t",), 2, vcvtnhp2shCode)
twoRegMiscInst("vcvtn.s16.f16", "NVcvt2shNQ", "SimdCvtOp",
("int16_t",), 4, vcvtnhp2shCode)
vcvtphp2shCode = vcvthp2hCode % ("true", "VfpRoundUpward")
twoRegMiscInst("vcvtp.s16.f16", "NVcvt2shPD", "SimdCvtOp",
("int16_t",), 2, vcvtphp2shCode)
twoRegMiscInst("vcvtp.s16.f16", "NVcvt2shPQ", "SimdCvtOp",
("int16_t",), 4, vcvtphp2shCode)
vcvtmhp2shCode = vcvthp2hCode % ("true", "VfpRoundDown")
twoRegMiscInst("vcvtm.s16.f16", "NVcvt2shMD", "SimdCvtOp",
("int16_t",), 2, vcvtmhp2shCode)
twoRegMiscInst("vcvtm.s16.f16", "NVcvt2shMQ", "SimdCvtOp",
("int16_t",), 4, vcvtmhp2shCode)
vcvtsp2sCode = '''
FPSCR fpscr = (FPSCR) FpscrExc;
VfpSavedState state = prepFpState(fpscr.rMode);
__asm__ __volatile__("" : "=m" (srcElem1) : "m" (srcElem1));
float mid = bitsToFp(srcElem1, (float)0.0);
if (flushToZero(mid))
fpscr.idc = 1;
destElem = vfpFpToFixed<float>(mid, %s, 32, 0, true, %s);
__asm__ __volatile__("" :: "m" (destElem));
finishVfp(fpscr, state, true);
FpscrExc = fpscr;
'''
vcvtasp2usCode = vcvtsp2sCode % ("false", "VfpRoundAway")
twoRegMiscInst("vcvta.u32.f32", "NVcvt2usAD", "SimdCvtOp",
("uint32_t",), 2, vcvtasp2usCode)
twoRegMiscInst("vcvta.u32.f32", "NVcvt2usAQ", "SimdCvtOp",
("uint32_t",), 4, vcvtasp2usCode)
vcvtnsp2usCode = vcvtsp2sCode % ("false", "VfpRoundNearest")
twoRegMiscInst("vcvtn.u32.f32", "NVcvt2usND", "SimdCvtOp",
("uint32_t",), 2, vcvtnsp2usCode)
twoRegMiscInst("vcvtn.u32.f32", "NVcvt2usNQ", "SimdCvtOp",
("uint32_t",), 4, vcvtnsp2usCode)
vcvtpsp2usCode = vcvtsp2sCode % ("false", "VfpRoundUpward")
twoRegMiscInst("vcvtp.u32.f32", "NVcvt2usPD", "SimdCvtOp",
("uint32_t",), 2, vcvtpsp2usCode)
twoRegMiscInst("vcvtp.u32.f32", "NVcvt2usPQ", "SimdCvtOp",
("uint32_t",), 4, vcvtpsp2usCode)
vcvtmsp2usCode = vcvtsp2sCode % ("false", "VfpRoundDown")
twoRegMiscInst("vcvtm.u32.f32", "NVcvt2usMD", "SimdCvtOp",
("uint32_t",), 2, vcvtmsp2usCode)
twoRegMiscInst("vcvtm.u32.f32", "NVcvt2usMQ", "SimdCvtOp",
("uint32_t",), 4, vcvtmsp2usCode)
vcvtasp2ssCode = vcvtsp2sCode % ("true", "VfpRoundAway")
twoRegMiscInst("vcvta.s32.f32", "NVcvt2ssAD", "SimdCvtOp",
("int32_t",), 2, vcvtasp2ssCode)
twoRegMiscInst("vcvta.s32.f32", "NVcvt2ssAQ", "SimdCvtOp",
("int32_t",), 4, vcvtasp2ssCode)
vcvtnsp2ssCode = vcvtsp2sCode % ("true", "VfpRoundNearest")
twoRegMiscInst("vcvtn.s32.f32", "NVcvt2ssND", "SimdCvtOp",
("int32_t",), 2, vcvtnsp2ssCode)
twoRegMiscInst("vcvtn.s32.f32", "NVcvt2ssNQ", "SimdCvtOp",
("int32_t",), 4, vcvtnsp2ssCode)
vcvtpsp2ssCode = vcvtsp2sCode % ("true", "VfpRoundUpward")
twoRegMiscInst("vcvtp.s32.f32", "NVcvt2ssPD", "SimdCvtOp",
("int32_t",), 2, vcvtpsp2ssCode)
twoRegMiscInst("vcvtp.s32.f32", "NVcvt2ssPQ", "SimdCvtOp",
("int32_t",), 4, vcvtpsp2ssCode)
vcvtmsp2ssCode = vcvtsp2sCode % ("true", "VfpRoundDown")
twoRegMiscInst("vcvtm.s32.f32", "NVcvt2ssMD", "SimdCvtOp",
("int32_t",), 2, vcvtmsp2ssCode)
twoRegMiscInst("vcvtm.s32.f32", "NVcvt2ssMQ", "SimdCvtOp",
("int32_t",), 4, vcvtmsp2ssCode)
vrsqrteCode = '''
destElem = unsignedRSqrtEstimate(srcElem1);
'''

View File

@@ -89,6 +89,12 @@ class BaseInterrupts : public SimObject
{
panic("Interrupts::clearAll unimplemented!\n");
}
virtual bool
isWakeUp() const
{
return true;
}
};
} // namespace gem5

View File

@@ -111,12 +111,13 @@ class Template:
operands = SubOperandList(self.parser, compositeCode, d.operands)
myDict[
"reg_idx_arr_decl"
] = "RegId srcRegIdxArr[%d]; RegId destRegIdxArr[%d]" % (
myDict["reg_idx_arr_decl"] = (
"RegId srcRegIdxArr[%d]; RegId destRegIdxArr[%d]"
% (
d.operands.numSrcRegs + d.srcRegIdxPadding,
d.operands.numDestRegs + d.destRegIdxPadding,
)
)
# The reinterpret casts are largely because an array with a known
# size cannot be passed as an argument which is an array with an
@@ -821,7 +822,7 @@ class ISAParser(Grammar):
"DBLCOLON",
"ASTERISK",
# C preprocessor directives
"CPPDIRECTIVE"
"CPPDIRECTIVE",
# The following are matched but never returned. commented out to
# suppress PLY warning
# newfile directive

View File

@@ -140,9 +140,9 @@ def handle_statement(parser, container, statement):
if statement.is_microop:
if statement.mnemonic not in parser.microops.keys():
raise Exception(f"Unrecognized mnemonic: {statement.mnemonic}")
parser.symbols[
"__microopClassFromInsideTheAssembler"
] = parser.microops[statement.mnemonic]
parser.symbols["__microopClassFromInsideTheAssembler"] = (
parser.microops[statement.mnemonic]
)
try:
microop = eval(
f"__microopClassFromInsideTheAssembler({statement.params})",
@@ -166,9 +166,9 @@ def handle_statement(parser, container, statement):
elif statement.is_directive:
if statement.name not in container.directives.keys():
raise Exception(f"Unrecognized directive: {statement.name}")
parser.symbols[
"__directiveFunctionFromInsideTheAssembler"
] = container.directives[statement.name]
parser.symbols["__directiveFunctionFromInsideTheAssembler"] = (
container.directives[statement.name]
)
try:
eval(
f"__directiveFunctionFromInsideTheAssembler({statement.params})",

View File

@@ -114,6 +114,13 @@ class RiscvISA(BaseISA):
enable_Zicbom_fs = Param.Bool(True, "Enable Zicbom extension in FS mode")
enable_Zicboz_fs = Param.Bool(True, "Enable Zicboz extension in FS mode")
enable_Zcd = Param.Bool(
True,
"Enable Zcd extensions. "
"Set the option to false implies the Zcmp and Zcmt is enable as "
"c.fsdsp is overlap with them."
"Refs: https://github.com/riscv/riscv-isa-manual/blob/main/src/zc.adoc",
)
wfi_resume_on_pending = Param.Bool(
False,

View File

@@ -44,6 +44,7 @@ Decoder::Decoder(const RiscvDecoderParams &p) : InstDecoder(p, &machInst)
ISA *isa = dynamic_cast<ISA*>(p.isa);
vlen = isa->getVecLenInBits();
elen = isa->getVecElemLenInBits();
_enableZcd = isa->enableZcd();
reset();
}
@@ -127,6 +128,7 @@ Decoder::decode(PCStateBase &_next_pc)
emi.vtype8 = next_pc.vtype() & 0xff;
emi.vill = next_pc.vtype().vill;
emi.rv_type = static_cast<int>(next_pc.rvType());
emi.enable_zcd = _enableZcd;
return decode(emi, next_pc.instAddr());
}

View File

@@ -62,6 +62,7 @@ class Decoder : public InstDecoder
uint32_t vlen;
uint32_t elen;
bool _enableZcd;
virtual StaticInstPtr decodeInst(ExtMachInst mach_inst);

View File

@@ -34,3 +34,4 @@ Source('mem.cc', tags='riscv isa')
Source('standard.cc', tags='riscv isa')
Source('static_inst.cc', tags='riscv isa')
Source('vector.cc', tags='riscv isa')
Source('zcmp.cc', tags='riscv isa')

View File

@@ -0,0 +1,130 @@
/*
* Copyright (c) 2024 Google LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met: redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer;
* redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution;
* neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "arch/riscv/insts/zcmp.hh"
#include <string>
#include "arch/riscv/regs/int.hh"
#include "arch/riscv/utility.hh"
namespace gem5
{
namespace RiscvISA
{
CmMacroInst::CmMacroInst(
const char* mnem, ExtMachInst machInst, OpClass opClass)
: RiscvMacroInst(mnem, machInst, opClass), rlist(machInst.rlist)
{
}
// Ref: https://github.com/riscv-software-src/riscv-isa-sim/blob/f7d0dba60/
// riscv/decode.h#L168
uint64_t
CmMacroInst::stackAdj() const
{
uint64_t stack_adj_base = 0;
switch (machInst.rlist) {
case 15:
stack_adj_base += 16;
[[fallthrough]];
case 14:
if (machInst.rv_type == RV64) {
stack_adj_base += 16;
}
[[fallthrough]];
case 13:
case 12:
stack_adj_base += 16;
[[fallthrough]];
case 11:
case 10:
if (machInst.rv_type == RV64) {
stack_adj_base += 16;
}
[[fallthrough]];
case 9:
case 8:
stack_adj_base += 16;
[[fallthrough]];
case 7:
case 6:
if (machInst.rv_type == RV64) {
stack_adj_base += 16;
}
[[fallthrough]];
case 5:
case 4:
stack_adj_base += 16;
break;
}
return stack_adj_base + machInst.spimm * 16;
}
std::string
CmMacroInst::getRlistStr() const
{
std::string s = "";
switch (machInst.rlist) {
case 15:
s = csprintf("{%s, %s-%s}", registerName(ReturnAddrReg),
registerName(int_reg::S0),
registerName(PushPopRegList[0]));
break;
case 14:
case 13:
case 12:
case 11:
case 10:
case 9:
case 8:
case 7:
case 6:
s = csprintf("{%s, %s-%s}", registerName(ReturnAddrReg),
registerName(int_reg::S0),
registerName(PushPopRegList[16-machInst.rlist]));
break;
case 5:
s = csprintf("{%s, %s}", registerName(ReturnAddrReg),
registerName(int_reg::S0));
break;
case 4:
s = csprintf("{%s}", registerName(ReturnAddrReg));
break;
default:
break;
}
return s;
}
} // namespace RiscvISA
} // namespace gem5

View File

@@ -0,0 +1,60 @@
/*
* Copyright (c) 2024 Google LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met: redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer;
* redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution;
* neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __ARCH_RISCV_INSTS_ZCMP_HH__
#define __ARCH_RISCV_INSTS_ZCMP_HH__
#include <string>
#include "arch/riscv/insts/static_inst.hh"
#include "cpu/static_inst.hh"
namespace gem5
{
namespace RiscvISA
{
class CmMacroInst : public RiscvMacroInst
{
public:
CmMacroInst(const char* mnem, ExtMachInst machInst, OpClass opClass);
protected:
using RiscvMacroInst::RiscvMacroInst;
uint64_t stackAdj() const;
std::string getRlistStr() const;
uint64_t rlist;
};
} // namespace RiscvISA
} // namespace gem5
#endif // __ARCH_RISCV_INSTS_ZCMP_HH__

View File

@@ -95,6 +95,11 @@ class Interrupts : public BaseInterrupts
void clearAll() override;
bool isWakeUp() const override
{
return checkNonMaskableInterrupt() || (ip & ie).any();
}
uint64_t readIP() const { return (uint64_t)ip.to_ulong(); }
uint64_t readIE() const { return (uint64_t)ie.to_ulong(); }
void setIP(const uint64_t& val) { ip = val; }

View File

@@ -260,7 +260,7 @@ RegClass ccRegClass(CCRegClass, CCRegClassName, 0, debug::IntRegs);
ISA::ISA(const Params &p) : BaseISA(p, "riscv"),
_rvType(p.riscv_type), enableRvv(p.enable_rvv), vlen(p.vlen), elen(p.elen),
_privilegeModeSet(p.privilege_mode_set),
_wfiResumeOnPending(p.wfi_resume_on_pending)
_wfiResumeOnPending(p.wfi_resume_on_pending), _enableZcd(p.enable_Zcd)
{
_regClasses.push_back(&intRegClass);
_regClasses.push_back(&floatRegClass);

View File

@@ -108,6 +108,14 @@ class ISA : public BaseISA
*/
const bool _wfiResumeOnPending;
/**
* Enable Zcd extensions.
* Set the option to false implies the Zcmp and Zcmt is enable as c.fsdsp
* is overlap with them.
* Refs: https://github.com/riscv/riscv-isa-manual/blob/main/src/zc.adoc
*/
bool _enableZcd;
public:
using Params = RiscvISAParams;
@@ -184,6 +192,8 @@ class ISA : public BaseISA
bool resumeOnPending() { return _wfiResumeOnPending; }
bool enableZcd() { return _enableZcd; }
virtual Addr getFaultHandlerAddr(
RegIndex idx, uint64_t cause, bool intr) const;
};

View File

@@ -34,6 +34,7 @@
// Bitfield definitions.
//
def bitfield RVTYPE rv_type;
def bitfield ENABLE_ZCD enable_zcd;
def bitfield QUADRANT <1:0>;
def bitfield OPCODE5 <6:2>;
@@ -103,10 +104,13 @@ def bitfield CFUNCT1 <12>;
def bitfield CFUNCT1BIT6 <6>;
def bitfield CFUNCT2HIGH <11:10>;
def bitfield CFUNCT2LOW <6:5>;
def bitfield CFUNCT2MID <9:8>;
def bitfield RC1 <11:7>;
def bitfield RC2 <6:2>;
def bitfield RP1 <9:7>;
def bitfield RP2 <4:2>;
def bitfield R1S <9:7>;
def bitfield R2S <4:2>;
def bitfield FC1 <11:7>;
def bitfield FC2 <6:2>;
def bitfield FP2 <4:2>;

View File

@@ -54,6 +54,7 @@ decode QUADRANT default Unknown::unknown() {
Rp2 = rvSext(sp + imm);
}}, uint64_t);
format CompressedLoad {
0x1: decode ENABLE_ZCD {
0x1: c_fld({{
offset = CIMM3 << 3 | CIMM2 << 6;
}}, {{
@@ -71,6 +72,7 @@ decode QUADRANT default Unknown::unknown() {
}}, {{
EA = rvSext(Rp1 + offset);
}});
}
0x2: c_lw({{
offset = CIMM2<1:1> << 2 |
CIMM3 << 3 |
@@ -152,7 +154,8 @@ decode QUADRANT default Unknown::unknown() {
}
}
format CompressedStore {
0x5: c_fsd({{
0x5: decode ENABLE_ZCD {
0x1: c_fsd({{
offset = CIMM3 << 3 | CIMM2 << 6;
}}, {{
STATUS status = xc->readMiscReg(MISCREG_STATUS);
@@ -164,6 +167,7 @@ decode QUADRANT default Unknown::unknown() {
}}, {{
EA = rvSext(Rp1 + offset);
}});
}
0x6: c_sw({{
offset = CIMM2<1:1> << 2 |
CIMM3 << 3 |
@@ -381,6 +385,7 @@ decode QUADRANT default Unknown::unknown() {
Rc1 = rvSext(Rc1 << imm);
}}, uint64_t);
format CompressedLoad {
0x1: decode ENABLE_ZCD {
0x1: c_fldsp({{
offset = CIMM5<4:3> << 3 |
CIMM1 << 5 |
@@ -398,6 +403,7 @@ decode QUADRANT default Unknown::unknown() {
}}, {{
EA = rvSext(sp + offset);
}});
}
0x2: c_lwsp({{
offset = CIMM5<4:2> << 2 |
CIMM1 << 5 |
@@ -480,7 +486,22 @@ decode QUADRANT default Unknown::unknown() {
}
}
format CompressedStore {
0x5: c_fsdsp({{
0x5: decode ENABLE_ZCD {
0x0: decode CFUNCT6LOW3 {
0x3: decode CFUNCT2LOW {
0x1: CmMvsa01::cm_mvsa01();
0x3: CmMva01s::cm_mva01s();
}
0x6: decode CFUNCT2MID {
0x0: CmPush::cm_push();
0x2: CmPop::cm_pop();
}
0x7: decode CFUNCT2MID {
0x0: CmPop::cm_popretz(is_ret=True, has_a0=True);
0x2: CmPop::cm_popret(is_ret=True);
}
}
0x1: c_fsdsp({{
offset = CIMM6<5:3> << 3 |
CIMM6<2:0> << 6;
}}, {{
@@ -493,6 +514,7 @@ decode QUADRANT default Unknown::unknown() {
}}, {{
EA = rvSext(sp + offset);
}});
}
0x6: c_swsp({{
offset = CIMM6<5:2> << 2 |
CIMM6<1:0> << 6;

View File

@@ -40,6 +40,7 @@
##include "vector_conf.isa"
##include "vector_arith.isa"
##include "vector_mem.isa"
##include "zcmp.isa"
// Include formats for nonstandard extensions
##include "compressed.isa"

View File

@@ -0,0 +1,782 @@
// -*- mode:c++ -*-
// Copyright (c) 2015 RISC-V Foundation
// Copyright (c) 2016 The University of Virginia
// Copyright (c) 2024 Google LLC
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met: redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer;
// redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution;
// neither the name of the copyright holders nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Cmpush template.
def template CmPushDeclare {{
class %(class_name)s : public %(base_class)s
{
public:
%(class_name)s(ExtMachInst machInst);
protected:
using %(base_class)s::%(base_class)s;
std::string generateDisassembly(
Addr pc, const loader::SymbolTable *symtab) const override;
};
}};
def template CmPushConstructor {{
%(class_name)s::%(class_name)s(ExtMachInst machInst) :
%(base_class)s("%(mnemonic)s", machInst, %(op_class)s)
{
StaticInstPtr cur_inst = nullptr;
if (rlist < 4) {
cur_inst = new Unknown(machInst);
cur_inst->setFlag(IsMicroop);
cur_inst->setDelayedCommit();
microops.emplace_back(cur_inst);
} else {
int start_reg = 0;
if (rlist != 15) {
start_reg = (16-rlist);
}
int offset = 0;
for (int i = start_reg; i < PushPopRegList.size(); i++) {
offset -= rvSelect(4, 8);
if (machInst.rv_type == RV32) {
cur_inst = new %(class_name)s32MicroInst(
machInst, PushPopRegList[i], offset);
} else {
cur_inst = new %(class_name)s64MicroInst(
machInst, PushPopRegList[i], offset);
}
cur_inst->setDelayedCommit();
microops.emplace_back(cur_inst);
}
cur_inst = new %(class_name)sSpAdjMicroInst(machInst, -stackAdj());
cur_inst->setDelayedCommit();
microops.emplace_back(cur_inst);
}
microops.front()->setFirstMicroop();
microops.back()->setLastMicroop();
}
}};
def template CmPushExecute {{
std::string
%(class_name)s::generateDisassembly(
Addr pc, const loader::SymbolTable *symtab) const
{
std::stringstream ss;
ss << mnemonic << ' ' << getRlistStr() << ", " << (int64_t)-stackAdj();
return ss.str();
}
}};
def template CmStoreMicroDeclare {{
class %(class_name)s : public %(base_class)s
{
public:
%(class_name)s(ExtMachInst machInst, RegId push_reg, int64_t offset);
Fault execute(ExecContext *, trace::InstRecord *) const override;
Fault initiateAcc(ExecContext *, trace::InstRecord *) const override;
Fault completeAcc(
Packet *, ExecContext *, trace::InstRecord *) const override;
std::string generateDisassembly(
Addr, const loader::SymbolTable *) const override;
protected:
using %(base_class)s::%(base_class)s;
private:
%(reg_idx_arr_decl)s;
int64_t offset;
Request::Flags memAccessFlags;
};
}};
def template CmStoreMicroConstructor {{
%(class_name)s::%(class_name)s(
ExtMachInst machInst, RegId push_reg, int64_t offset)
: %(base_class)s("%(mnemonic)s", machInst, %(op_class)s),
offset(offset)
{
%(set_reg_idx_arr)s;
%(constructor)s;
}
}};
def template CmStoreMicroExecute {{
Fault
%(class_name)s::execute(
ExecContext *xc, trace::InstRecord *traceData) const
{
Addr EA;
%(op_decl)s;
%(op_rd)s;
%(ea_code)s;
%(memacc_code)s;
{
Fault fault =
writeMemAtomicLE(xc, traceData, Mem, EA, memAccessFlags,
nullptr);
if (fault != NoFault)
return fault;
}
%(op_wb)s;
return NoFault;
}
std::string
%(class_name)s::generateDisassembly(
Addr pc, const loader::SymbolTable *symtab) const
{
std::stringstream ss;
ss << mnemonic << ' ' << registerName(srcRegIdx(1)) << ", " <<
offset << '(' << registerName(srcRegIdx(0)) << ')';
return ss.str();
}
}};
def template CmStoreMicroInitiateAcc {{
Fault
%(class_name)s::initiateAcc(ExecContext *xc,
trace::InstRecord *traceData) const
{
Addr EA;
%(op_decl)s;
%(op_rd)s;
%(ea_code)s;
%(memacc_code)s;
{
Fault fault = writeMemTimingLE(xc, traceData, Mem, EA,
memAccessFlags, nullptr);
if (fault != NoFault)
return fault;
}
%(op_wb)s;
return NoFault;
}
}};
def template CmStoreMicroCompleteAcc {{
Fault
%(class_name)s::completeAcc(PacketPtr pkt, ExecContext *xc,
trace::InstRecord *traceData) const
{
return NoFault;
}
}};
def template SpAdjMicroDeclare {{
class %(class_name)s : public %(base_class)s
{
public:
%(class_name)s(ExtMachInst machInst, int64_t adj);
protected:
using %(base_class)s::%(base_class)s;
Fault execute(ExecContext *, trace::InstRecord *) const override;
std::string generateDisassembly(
Addr, const loader::SymbolTable *) const override;
private:
%(reg_idx_arr_decl)s;
int64_t adj;
};
}};
def template SpAdjMicroConstructor {{
%(class_name)s::%(class_name)s(ExtMachInst machInst, int64_t adj)
: %(base_class)s("%(mnemonic)s", machInst, %(op_class)s), adj(adj)
{
%(set_reg_idx_arr)s;
%(constructor)s;
}
}};
def template SpAdjMicroExecute {{
Fault
%(class_name)s::execute(
ExecContext *xc, trace::InstRecord *traceData) const
{
%(op_decl)s;
%(op_rd)s;
%(code)s;
%(op_wb)s;
return NoFault;
}
std::string
%(class_name)s::generateDisassembly(
Addr pc, const loader::SymbolTable *symtab) const
{
std::stringstream ss;
ss << mnemonic << ' ' << registerName(destRegIdx(0)) << ' '
<< registerName(srcRegIdx(0)) << ' ' << adj;
return ss.str();
}
}};
// Cmpop decode template.
def template CmPopDeclare {{
class %(class_name)s : public %(base_class)s
{
public:
%(class_name)s(ExtMachInst machInst);
std::string generateDisassembly(
Addr pc, const loader::SymbolTable *symtab) const override;
protected:
using %(base_class)s::%(base_class)s;
};
}};
def template CmPopConstructor {{
%(class_name)s::%(class_name)s(ExtMachInst machInst) :
%(base_class)s("%(mnemonic)s", machInst, %(op_class)s)
{
StaticInstPtr cur_inst = nullptr;
if (rlist < 4) {
cur_inst = new Unknown(machInst);
cur_inst->setFlag(IsMicroop);
cur_inst->setDelayedCommit();
microops.emplace_back(cur_inst);
} else {
int start_reg = 0;
if (rlist != 15) {
start_reg = (16-rlist);
}
int offset = stackAdj();
for (int i = start_reg; i < PushPopRegList.size(); i++) {
offset -= rvSelect(4, 8);
if (machInst.rv_type == RV32) {
cur_inst = new %(class_name)s32MicroInst(
machInst, PushPopRegList[i], offset);
} else {
cur_inst = new %(class_name)s64MicroInst(
machInst, PushPopRegList[i], offset);
}
cur_inst->setDelayedCommit();
microops.emplace_back(cur_inst);
}
cur_inst = new %(class_name)sSpAdjMicroInst(machInst, stackAdj());
cur_inst->setDelayedCommit();
microops.emplace_back(cur_inst);
%(move_a0_desc)s;
%(return_desc)s;
}
microops.front()->setFirstMicroop();
microops.back()->setLastMicroop();
}
}};
def template CmPopExecute {{
std::string
%(class_name)s::generateDisassembly(
Addr pc, const loader::SymbolTable *symtab) const
{
std::stringstream ss;
ss << mnemonic << ' ' << getRlistStr() << ", " << stackAdj();
return ss.str();
}
}};
// Declaration of the per-register load microop used by cm.pop*: loads one
// saved register back from its stack slot at sp + offset.
def template CmLoadMicroDeclare {{
class %(class_name)s : public %(base_class)s
{
public:
%(class_name)s(ExtMachInst machInst, RegId pop_reg, int64_t offset);
Fault execute(ExecContext *, trace::InstRecord *) const override;
Fault initiateAcc(ExecContext *, trace::InstRecord *) const override;
Fault completeAcc(
Packet *, ExecContext *, trace::InstRecord *) const override;
std::string generateDisassembly(
Addr, const loader::SymbolTable *) const override;
protected:
using %(base_class)s::%(base_class)s;
private:
%(reg_idx_arr_decl)s;
// Byte offset of this register's save slot relative to sp.
int64_t offset;
Request::Flags memAccessFlags;
};
}};
// Constructor: records the stack-slot offset; register indices are wired
// up by the generated %(set_reg_idx_arr)s / %(constructor)s snippets.
def template CmLoadMicroConstructor {{
%(class_name)s::%(class_name)s(
ExtMachInst machInst, RegId pop_reg, int64_t offset)
: %(base_class)s("%(mnemonic)s", machInst, %(op_class)s),
offset(offset)
{
%(set_reg_idx_arr)s;
%(constructor)s;
}
}};
// Atomic-mode execute: compute the effective address, read memory
// little-endian, then write the loaded value back.  Also provides the
// microop disassembly ("<mnemonic> rd, offset(rs1)").
def template CmLoadMicroExecute {{
Fault
%(class_name)s::execute(
ExecContext *xc, trace::InstRecord *traceData) const
{
Addr EA;
%(op_decl)s;
%(op_rd)s;
%(ea_code)s;
{
Fault fault =
readMemAtomicLE(xc, traceData, EA, Mem, memAccessFlags);
if (fault != NoFault)
return fault;
}
%(memacc_code)s;
%(op_wb)s;
return NoFault;
}
std::string
%(class_name)s::generateDisassembly(
Addr pc, const loader::SymbolTable *symtab) const
{
std::stringstream ss;
ss << mnemonic << ' ' << registerName(destRegIdx(0)) << ", " <<
offset << '(' << registerName(srcRegIdx(0)) << ')';
return ss.str();
}
}};
// Timing-mode: initiateAcc issues the memory read request...
def template CmLoadMicroInitiateAcc {{
Fault
%(class_name)s::initiateAcc(ExecContext *xc,
trace::InstRecord *traceData) const
{
Addr EA;
%(op_src_decl)s;
%(op_rd)s;
%(ea_code)s;
return initiateMemRead(xc, traceData, EA, Mem, memAccessFlags);
}
}};
// ...and completeAcc consumes the returned packet and writes back.
def template CmLoadMicroCompleteAcc {{
Fault
%(class_name)s::completeAcc(PacketPtr pkt, ExecContext *xc,
trace::InstRecord *traceData) const
{
%(op_decl)s;
%(op_rd)s;
getMemLE(pkt, Mem, traceData);
%(memacc_code)s;
%(op_wb)s;
return NoFault;
}
}};
// Declaration of the return microop used by cm.popret/cm.popretz: an
// indirect jump through ra.
def template CmRetMicroDeclare {{
class %(class_name)s : public %(base_class)s
{
public:
/// Constructor.
%(class_name)s(ExtMachInst machInst);
protected:
using %(base_class)s::%(base_class)s;
Fault execute(ExecContext *, trace::InstRecord *) const override;
std::string
generateDisassembly(
Addr pc, const loader::SymbolTable *symtab) const override;
std::unique_ptr<PCStateBase> branchTarget(
ThreadContext *tc) const override;
using StaticInst::branchTarget;
private:
%(reg_idx_arr_decl)s;
};
}};
def template CmRetMicroConstructor {{
%(class_name)s::%(class_name)s(ExtMachInst machInst)
: %(base_class)s("%(mnemonic)s", machInst, %(op_class)s)
{
%(set_reg_idx_arr)s;
%(constructor)s;
}
}};
// Execute plus branch-target and disassembly support for the return
// microop; %(code)s carries the NPC update.
def template CmRetMicroExecute {{
Fault
%(class_name)s::execute(
ExecContext *xc, trace::InstRecord *traceData) const
{
%(op_decl)s;
%(op_rd)s;
%(code)s;
%(op_wb)s;
return NoFault;
}
std::unique_ptr<PCStateBase>
%(class_name)s::branchTarget(ThreadContext *tc) const
{
PCStateBase *pc_ptr = tc->pcState().clone();
// Bit 0 of the target is cleared (JALR-style) before sign extension.
pc_ptr->as<PCState>().set(rvSext(tc->getReg(srcRegIdx(0)) & ~0x1));
return std::unique_ptr<PCStateBase>{pc_ptr};
}
std::string
%(class_name)s::generateDisassembly(
Addr pc, const loader::SymbolTable *symtab) const
{
std::stringstream ss;
ss << mnemonic << ' ' << registerName(srcRegIdx(0));
return ss.str();
}
}};
// Shared declaration template for the cm.mvsa01 / cm.mva01s macroops.
def template CmMvDeclare {{
class %(class_name)s : public %(base_class)s
{
public:
%(class_name)s(ExtMachInst machInst);
protected:
using %(base_class)s::%(base_class)s;
std::string generateDisassembly(
Addr pc, const loader::SymbolTable *symtab) const override;
};
}};
// cm.mvsa01: copies a0/a1 into the two s-registers selected by r1s/r2s.
// The mv microop constructor argument order is (push_reg, pop_reg) =
// (source, destination), per the 'CmPopReg = CmPushReg;' microop code.
def template CmMvsa01Constructor {{
%(class_name)s::%(class_name)s(ExtMachInst machInst)
: %(base_class)s("%(mnemonic)s", machInst, %(op_class)s)
{
StaticInstPtr cur_inst;
cur_inst = new %(class_name)sMvMicroInst(
machInst, int_reg::A0, StackRegs[machInst.r1s]);
microops.emplace_back(cur_inst);
cur_inst = new %(class_name)sMvMicroInst(
machInst, int_reg::A1, StackRegs[machInst.r2s]);
microops.emplace_back(cur_inst);
microops.front()->setFirstMicroop();
microops.back()->setLastMicroop();
}
}};
// cm.mva01s: the reverse move, s-registers into a0/a1.
def template CmMva01sConstructor {{
%(class_name)s::%(class_name)s(ExtMachInst machInst)
: %(base_class)s("%(mnemonic)s", machInst, %(op_class)s)
{
StaticInstPtr cur_inst;
cur_inst = new %(class_name)sMvMicroInst(
machInst, StackRegs[machInst.r1s], int_reg::A0);
cur_inst->setDelayedCommit();
microops.emplace_back(cur_inst);
cur_inst = new %(class_name)sMvMicroInst(
machInst, StackRegs[machInst.r2s], int_reg::A1);
cur_inst->setDelayedCommit();
microops.emplace_back(cur_inst);
microops.front()->setFirstMicroop();
microops.back()->setLastMicroop();
}
}};
// Macroop disassembly: prints only the s-register pair from the encoding;
// a0/a1 and the move direction are not shown.
def template CmMvExecute {{
std::string
%(class_name)s::generateDisassembly(
Addr pc, const loader::SymbolTable *symtab) const
{
std::stringstream ss;
ss << mnemonic << ' ' << registerName(StackRegs[machInst.r1s])
<< ", " << registerName(StackRegs[machInst.r2s]);
return ss.str();
}
}};
// Register-to-register move microop shared by the Zcmp macroops.
def template CmMvMicroDeclare {{
class %(class_name)s : public %(base_class)s
{
public:
%(class_name)s(ExtMachInst machInst, RegId push_reg, RegId pop_reg);
protected:
using %(base_class)s::%(base_class)s;
Fault execute(ExecContext *, trace::InstRecord *) const override;
std::string generateDisassembly(
Addr, const loader::SymbolTable *) const override;
private:
%(reg_idx_arr_decl)s;
};
}};
def template CmMvMicroConstructor {{
%(class_name)s::%(class_name)s(
ExtMachInst machInst, RegId push_reg, RegId pop_reg)
: %(base_class)s("%(mnemonic)s", machInst, %(op_class)s)
{
%(set_reg_idx_arr)s;
%(constructor)s;
}
}};
def template CmMvMicroExecute {{
Fault
%(class_name)s::execute(
ExecContext *xc, trace::InstRecord *traceData) const
{
%(op_decl)s;
%(op_rd)s;
%(code)s;
%(op_wb)s;
return NoFault;
}
std::string
%(class_name)s::generateDisassembly(
Addr pc, const loader::SymbolTable *symtab) const
{
std::stringstream ss;
// NOTE(review): operands are separated by a bare space here, while the
// other Zcmp disassembly uses ", " -- possibly unintended.
ss << mnemonic << ' ' << registerName(destRegIdx(0)) << ' '
<< registerName(srcRegIdx(0));
return ss.str();
}
}};
def format CmPush(*flags) {{
    # cm.push: macroop that stores the rlist registers to the stack and then
    # decrements sp.  It expands into one store microop per saved register
    # (RV32: sw, RV64: sd) followed by an sp-adjust microop.
    code = ''
    macro_iop = InstObjParams(name, Name, 'CmMacroInst', code, flags)
    header_output = CmPushDeclare.subst(macro_iop)
    decoder_output = CmPushConstructor.subst(macro_iop)
    exec_output = CmPushExecute.subst(macro_iop)
    decode_block = BasicDecode.subst(macro_iop)
    # RV32 store microop.  Bug fix: the mnemonic was 'lw', which made this
    # store microop ('Mem_sw = CmPushReg_sw', CmStoreMicro* templates)
    # disassemble as a load; cm.push saves registers with sw.
    memacc_code = 'Mem_sw = CmPushReg_sw;'
    ea_code = 'EA = rvSext(sp + offset);'
    micro32_iop = InstObjParams('sw', f'{Name}32MicroInst', 'RiscvMicroInst',
        {'ea_code': ea_code, 'memacc_code': memacc_code},
        flags)
    mem_flags = [getAlignFlag(micro32_iop)]
    s = '\n\tmemAccessFlags = ' + '|'.join(mem_flags) + ';'
    micro32_iop.constructor += s
    header_output += CmStoreMicroDeclare.subst(micro32_iop)
    decoder_output += CmStoreMicroConstructor.subst(micro32_iop)
    exec_output += CmStoreMicroExecute.subst(micro32_iop) \
        + CmStoreMicroInitiateAcc.subst(micro32_iop) \
        + CmStoreMicroCompleteAcc.subst(micro32_iop)
    # RV64 store microop (was 'ld'; this microop stores, so use 'sd').
    memacc_code = 'Mem = CmPushReg;'
    ea_code = 'EA = rvSext(sp + offset);'
    micro64_iop = InstObjParams('sd', f'{Name}64MicroInst', 'RiscvMicroInst',
        {'ea_code': ea_code, 'memacc_code': memacc_code},
        flags)
    mem_flags = [getAlignFlag(micro64_iop)]
    s = '\n\tmemAccessFlags = ' + '|'.join(mem_flags) + ';'
    micro64_iop.constructor += s
    header_output += CmStoreMicroDeclare.subst(micro64_iop)
    decoder_output += CmStoreMicroConstructor.subst(micro64_iop)
    exec_output += CmStoreMicroExecute.subst(micro64_iop) \
        + CmStoreMicroInitiateAcc.subst(micro64_iop) \
        + CmStoreMicroCompleteAcc.subst(micro64_iop)
    # Final microop: the sp adjustment (addi sp, sp, adj).
    code = 'spd = rvSext(sp + adj);'
    sp_adj_iop = InstObjParams('addi', f'{Name}SpAdjMicroInst',
        'RiscvMicroInst', code, flags)
    header_output += SpAdjMicroDeclare.subst(sp_adj_iop)
    decoder_output += SpAdjMicroConstructor.subst(sp_adj_iop)
    exec_output += SpAdjMicroExecute.subst(sp_adj_iop)
}};
def format CmPop(is_ret='False', has_a0='False', *flags) {{
    # cm.pop / cm.popret / cm.popretz: load the rlist registers from the
    # stack, adjust sp, optionally zero a0 (popretz) and return (popret*).
    # The defaults are the strings 'False' (not the bools) because format
    # arguments arrive as strings and are passed through eval(); a bool
    # default would make eval() raise TypeError when an argument is omitted.
    code = ''
    # NOTE(review): this discards any flags passed into the format (unlike
    # CmPush, which forwards them); kept as-is to preserve behavior.
    flags = []
    has_a0 = eval(has_a0)
    is_ret = eval(is_ret)
    move_a0_desc = ''
    return_desc = ''
    if has_a0:
        # cm.popretz must zero a0.  The mv microop constructor takes
        # (machInst, push_reg /* source */, pop_reg /* destination */),
        # matching the 'CmPopReg = CmPushReg;' microop code below, so x0
        # must be the source and a0 the destination.  Bug fix: the previous
        # argument order produced x0 = a0, a no-op that left a0 unchanged.
        move_a0_desc = rf'''
        cur_inst = new {Name}MvMicroInst(
            machInst, int_reg::Zero, ReturnValueReg);
        microops.emplace_back(cur_inst);
        '''
    if is_ret:
        return_desc = rf'''
        cur_inst = new {Name}RetMicroInst(machInst);
        microops.emplace_back(cur_inst);
        '''
    macro_iop = InstObjParams(name, Name, 'CmMacroInst',
        {'code': code, 'move_a0_desc': move_a0_desc,
         'return_desc': return_desc},
        flags)
    header_output = CmPopDeclare.subst(macro_iop)
    decoder_output = CmPopConstructor.subst(macro_iop)
    exec_output = CmPopExecute.subst(macro_iop)
    decode_block = BasicDecode.subst(macro_iop)
    # RV32 load microop (lw) restoring one register from its stack slot.
    memacc_code = 'CmPopReg_sw = Mem_sw;'
    ea_code = 'EA = rvSext(sp + offset);'
    micro32_iop = InstObjParams('lw', f'{Name}32MicroInst', 'RiscvMicroInst',
        {'ea_code': ea_code, 'memacc_code': memacc_code},
        flags)
    mem_flags = [getAlignFlag(micro32_iop)]
    s = '\n\tmemAccessFlags = ' + '|'.join(mem_flags) + ';'
    micro32_iop.constructor += s
    header_output += CmLoadMicroDeclare.subst(micro32_iop)
    decoder_output += CmLoadMicroConstructor.subst(micro32_iop)
    exec_output += CmLoadMicroExecute.subst(micro32_iop) \
        + CmLoadMicroInitiateAcc.subst(micro32_iop) \
        + CmLoadMicroCompleteAcc.subst(micro32_iop)
    # RV64 load microop (ld).
    memacc_code = 'CmPopReg = Mem;'
    ea_code = 'EA = rvSext(sp + offset);'
    micro64_iop = InstObjParams('ld', f'{Name}64MicroInst', 'RiscvMicroInst',
        {'ea_code': ea_code, 'memacc_code': memacc_code},
        flags)
    mem_flags = [getAlignFlag(micro64_iop)]
    s = '\n\tmemAccessFlags = ' + '|'.join(mem_flags) + ';'
    micro64_iop.constructor += s
    header_output += CmLoadMicroDeclare.subst(micro64_iop)
    decoder_output += CmLoadMicroConstructor.subst(micro64_iop)
    exec_output += CmLoadMicroExecute.subst(micro64_iop) \
        + CmLoadMicroInitiateAcc.subst(micro64_iop) \
        + CmLoadMicroCompleteAcc.subst(micro64_iop)
    # sp adjustment microop.
    code = 'spd = rvSext(sp + adj);'
    sp_adj_iop = InstObjParams('addi', f'{Name}SpAdjMicroInst',
        'RiscvMicroInst', code, flags)
    header_output += SpAdjMicroDeclare.subst(sp_adj_iop)
    decoder_output += SpAdjMicroConstructor.subst(sp_adj_iop)
    exec_output += SpAdjMicroExecute.subst(sp_adj_iop)
    if has_a0:
        # mv microop used to zero a0 for cm.popretz.
        code = 'CmPopReg = CmPushReg;'
        has_a0_iop = InstObjParams('mv', f'{Name}MvMicroInst',
            'RiscvMicroInst', code, flags)
        header_output += CmMvMicroDeclare.subst(has_a0_iop)
        decoder_output += CmMvMicroConstructor.subst(has_a0_iop)
        exec_output += CmMvMicroExecute.subst(has_a0_iop)
    if is_ret:
        # Indirect-return microop: jr ra with bit 0 of the target cleared.
        code = 'NPC = rvSext(ra & (~0x1));'
        ret_flags = ['IsIndirectControl', 'IsUncondControl', 'IsReturn']
        is_ret_iop = InstObjParams('jr', f'{Name}RetMicroInst',
            'RiscvMicroInst', code, ret_flags)
        header_output += CmRetMicroDeclare.subst(is_ret_iop)
        decoder_output += CmRetMicroConstructor.subst(is_ret_iop)
        exec_output += CmRetMicroExecute.subst(is_ret_iop)
}};
// cm.mvsa01: macroop of two mv microops copying a0/a1 into the selected
// s-registers.
def format CmMvsa01() {{
code = ''
flags = []
iop = InstObjParams(name, Name, 'RiscvMacroInst', code, flags)
header_output = CmMvDeclare.subst(iop)
decoder_output = CmMvsa01Constructor.subst(iop)
exec_output = CmMvExecute.subst(iop)
decode_block = BasicDecode.subst(iop)
# Microop semantics: destination (pop_reg) = source (push_reg).
code = 'CmPopReg = CmPushReg;'
micro_iop = InstObjParams('mv', f'{Name}MvMicroInst', 'RiscvMicroInst',
code, flags)
header_output += CmMvMicroDeclare.subst(micro_iop)
decoder_output += CmMvMicroConstructor.subst(micro_iop)
exec_output += CmMvMicroExecute.subst(micro_iop)
}};
// cm.mva01s: same structure, moving the selected s-registers into a0/a1.
def format CmMva01s() {{
code = ''
flags = []
iop = InstObjParams(name, Name, 'RiscvMacroInst', code, flags)
header_output = CmMvDeclare.subst(iop)
decoder_output = CmMva01sConstructor.subst(iop)
exec_output = CmMvExecute.subst(iop)
decode_block = BasicDecode.subst(iop)
code = 'CmPopReg = CmPushReg;'
micro_iop = InstObjParams('mv', f'{Name}MvMicroInst', 'RiscvMicroInst',
code, flags)
header_output += CmMvMicroDeclare.subst(micro_iop)
decoder_output += CmMvMicroConstructor.subst(micro_iop)
exec_output += CmMvMicroExecute.subst(micro_iop)
}};

View File

@@ -55,6 +55,7 @@ output header {{
#include "arch/riscv/insts/static_inst.hh"
#include "arch/riscv/insts/unknown.hh"
#include "arch/riscv/insts/vector.hh"
#include "arch/riscv/insts/zcmp.hh"
#include "arch/riscv/interrupts.hh"
#include "cpu/static_inst.hh"
#include "mem/packet.hh"

View File

@@ -70,10 +70,14 @@ def operands {{
'Rp2': IntReg('ud', 'RP2 + 8', 'IsInteger', 3),
'ra': IntReg('ud', 'ReturnAddrReg', 'IsInteger', 1),
'sp': IntReg('ud', 'StackPointerReg', 'IsInteger', 2),
'spd': IntReg('ud', 'StackPointerReg', 'IsInteger', 1),
'a0': IntReg('ud', '10', 'IsInteger', 1),
'a1': IntReg('ud', '11', 'IsInteger', 2),
'CmPushReg': IntReg('ud', 'push_reg', 'IsInteger', 3),
'CmPopReg': IntReg('ud', 'pop_reg', 'IsInteger', 1),
'Fd': FloatRegOp('df', 'FD', 'IsFloating', 1),
'Fd_bits': FloatRegOp('ud', 'FD', 'IsFloating', 1),
'Fs1': FloatRegOp('df', 'FS1', 'IsFloating', 2),

View File

@@ -34,6 +34,7 @@
#include "arch/riscv/utility.hh"
#include "kern/linux/flag_tables.hh"
#include "kern/linux/linux.hh"
#include "base/bitfield.hh"
namespace gem5
{
@@ -42,6 +43,101 @@ class RiscvLinux : public Linux
{
public:
static const ByteOrder byteOrder = ByteOrder::little;
enum RiscvHwprobeKey
{
Mvendorid,
Marchid,
Mimpid,
BaseBehavior,
IMAExt0,
Cpuperf0,
ZicbozBlockSize,
HighestVirtAddress,
TimeCsrFreq,
MisalignedScalarPerf
};
/* Increase RISCV_HWPROBE_MAX_KEY when adding items. */
#define RISCV_HWPROBE_MAX_KEY 9
BitUnion64(key_base_behavior_t)
Bitfield<0> ima;
EndBitUnion(key_base_behavior_t)
BitUnion64(key_ima_ext_0_t)
Bitfield<49> ZAWRS;
Bitfield<48> ZCMOP;
Bitfield<47> ZCF;
Bitfield<46> ZCD;
Bitfield<45> ZCB;
Bitfield<44> ZCA;
Bitfield<43> ZIMOP;
Bitfield<42> ZVE64D;
Bitfield<41> ZVE64F;
Bitfield<40> ZVE64X;
Bitfield<39> ZVE32F;
Bitfield<38> ZVE32X;
Bitfield<37> ZIHINTPAUSE;
Bitfield<36> ZICOND;
Bitfield<35> ZACAS;
Bitfield<34> ZTSO;
Bitfield<33> ZFA;
Bitfield<32> ZVFHMIN;
Bitfield<31> ZVFH;
Bitfield<30> ZIHINTNTL;
Bitfield<29> ZFHMIN;
Bitfield<28> ZFH;
Bitfield<27> ZVKT;
Bitfield<26> ZVKSH;
Bitfield<25> ZVKSED;
Bitfield<24> ZVKNHB;
Bitfield<22> ZVKNHA;
Bitfield<21> ZVKNED;
Bitfield<20> ZVKG;
Bitfield<19> ZVKB;
Bitfield<18> ZVBC;
Bitfield<17> ZVBB;
Bitfield<16> ZKT;
Bitfield<15> ZKSH;
Bitfield<14> ZKSED;
Bitfield<13> ZKNH;
Bitfield<12> ZKNE;
Bitfield<11> ZKND;
Bitfield<10> ZBKX;
Bitfield<9> ZBKC;
Bitfield<8> ZBKB;
Bitfield<7> ZBC;
Bitfield<6> ZICBOZ;
Bitfield<5> ZBS;
Bitfield<4> ZBB;
Bitfield<3> ZBA;
Bitfield<2> V;
Bitfield<1> C;
Bitfield<0> FD;
EndBitUnion(key_ima_ext_0_t)
enum MisalignedScalarPerf
{
Unknown,
Emulated,
Slow,
Fast,
Unsupported
};
/* Flags */
#define RISCV_HWPROBE_WHICH_CPUS (1 << 0)
struct riscv_hwprobe {
int64_t key;
uint64_t value;
};
typedef struct cpumask {
size_t size;
uint64_t bits[];
} cpumask_t;
};
class RiscvLinux64 : public RiscvLinux, public OpenFlagTable<RiscvLinux64>
@@ -195,6 +291,21 @@ class RiscvLinux64 : public RiscvLinux, public OpenFlagTable<RiscvLinux64>
uint32_t mem_unit;
};
struct tgt_clone_args
{
uint64_t flags;
uint64_t pidfd;
uint64_t child_tid;
uint64_t parent_tid;
uint64_t exit_signal;
uint64_t stack;
uint64_t stack_size;
uint64_t tls;
uint64_t set_tid;
uint64_t set_tid_size;
uint64_t cgroup;
};
static void
archClone(uint64_t flags,
Process *pp, Process *cp,

View File

@@ -44,6 +44,8 @@
#include <sys/syscall.h>
#include "arch/riscv/process.hh"
#include "arch/riscv/insts/static_inst.hh"
#include "arch/riscv/regs/misc.hh"
#include "base/loader/object_file.hh"
#include "base/trace.hh"
#include "cpu/thread_context.hh"
@@ -134,6 +136,388 @@ unameFunc32(SyscallDesc *desc, ThreadContext *tc, VPtr<Linux::utsname> name)
return 0;
}
// Dense Linux-style cpumask bit helpers: one bit per CPU, 64 CPUs packed
// per uint64_t word, matching the layout produced by CPU_SET() in
// get_cpu_online_mask() and by userspace cpu masks copied in/out.
//
// Bug fix: the word/bit index was previously computed with
// sizeof(uint64_t), which is 8 (bytes), not 64 (bits).  That packed only
// 8 CPUs per word and disagreed with the CPU_SET()/userspace layout for
// any CPU id >= 8.
static inline void
cpumask_set_cpu(unsigned int cpu, RiscvLinux::cpumask_t *dstp)
{
    // `size` is in bytes, so the mask holds size * 8 CPU bits.
    assert(cpu < dstp->size * 8);
    constexpr unsigned bits_per_word = sizeof(uint64_t) * 8;
    auto &word = dstp->bits[cpu / bits_per_word];
    word = insertBits(word, cpu % bits_per_word, 1);
}

static inline void
cpumask_clear_cpu(unsigned int cpu, RiscvLinux::cpumask_t *dstp)
{
    assert(cpu < dstp->size * 8);
    constexpr unsigned bits_per_word = sizeof(uint64_t) * 8;
    auto &word = dstp->bits[cpu / bits_per_word];
    word = insertBits(word, cpu % bits_per_word, 0);
}

static inline bool
cpumask_test_cpu(unsigned int cpu, const RiscvLinux::cpumask_t *cpumask)
{
    assert(cpu < cpumask->size * 8);
    constexpr unsigned bits_per_word = sizeof(uint64_t) * 8;
    return bits(cpumask->bits[cpu / bits_per_word],
                cpu % bits_per_word) != 0;
}
// Intersect src1p and src2p into dstp.  All three masks must share the
// same byte size; bits[] is processed one uint64_t word at a time.
static inline void
cpumask_and(RiscvLinux::cpumask_t *dstp, const RiscvLinux::cpumask_t *src1p,
const RiscvLinux::cpumask_t *src2p)
{
    assert(dstp->size == src1p->size);
    assert(dstp->size == src2p->size);
    const size_t nwords = dstp->size / sizeof(dstp->bits[0]);
    for (size_t w = 0; w < nwords; w++)
        dstp->bits[w] = src1p->bits[w] & src2p->bits[w];
}
// Return true when every word of the mask is zero (no CPU selected).
static inline bool
cpumask_empty(const RiscvLinux::cpumask_t *dstp)
{
    const size_t nwords = dstp->size / sizeof(dstp->bits[0]);
    for (size_t w = 0; w < nwords; w++) {
        if (dstp->bits[w])
            return false;
    }
    return true;
}
// Copy srcp into dstp; both masks must have the same byte size.
static inline void
cpumask_copy(RiscvLinux::cpumask_t *dstp, const RiscvLinux::cpumask_t *srcp)
{
assert(dstp->size == srcp->size);
memcpy(dstp->bits, srcp->bits, srcp->size);
}
// Zero every byte of the mask payload.
static inline void
cpumask_clear(RiscvLinux::cpumask_t *dstp)
{
memset(dstp->bits, 0, dstp->size);
}
// Allocate a zeroed mask sized for the simulated thread count.  Returns
// nullptr on allocation failure; release with cpumask_free().
// NOTE(review): size works out to ceil(threads / 8) * 8 bytes, i.e. one
// byte per CPU rounded to an 8-byte boundary -- larger than the one bit
// per CPU a dense mask needs, which is wasteful but harmless.
static inline RiscvLinux::cpumask_t *
cpumask_malloc(ThreadContext *tc)
{
RiscvLinux::cpumask_t *cpumask;
/* 8-bytes up-boundary alignment */
size_t size = (tc->getSystemPtr()->threads.size() + sizeof(cpumask->bits[0]) - 1) /
sizeof(cpumask->bits[0]) * sizeof(cpumask->bits[0]);
// bits[] is a flexible array member: allocate header plus payload.
cpumask = (RiscvLinux::cpumask_t *)malloc(sizeof(cpumask->size) + size);
if (cpumask != nullptr) {
cpumask->size = size;
cpumask_clear(cpumask);
}
return cpumask;
}
// Release a mask from cpumask_malloc(); free(nullptr) is a no-op.
static inline void
cpumask_free(RiscvLinux::cpumask_t *cpu_online_mask)
{
free(cpu_online_mask);
}
// A key is valid when it lies in the contiguous range
// [0, RISCV_HWPROBE_MAX_KEY].
static inline bool
riscv_hwprobe_key_is_valid(int64_t key)
{
    return !(key < 0 || key > RISCV_HWPROBE_MAX_KEY);
}
// Keys whose values are bitmasks (per-bit feature flags) rather than
// plain scalars; comparison semantics differ for these in
// riscv_hwprobe_pair_cmp().
static inline bool
hwprobe_key_is_bitmask(int64_t key)
{
    return key == RiscvLinux::BaseBehavior ||
           key == RiscvLinux::IMAExt0 ||
           key == RiscvLinux::Cpuperf0;
}
// Compare a probed pair against a user-requested pair.  Keys must match;
// for bitmask keys `pair` satisfies `other_pair` when it contains at least
// all of other_pair's bits, otherwise the values must be exactly equal.
static inline bool
riscv_hwprobe_pair_cmp(RiscvLinux::riscv_hwprobe *pair,
RiscvLinux::riscv_hwprobe *other_pair)
{
    if (pair->key != other_pair->key)
        return false;
    return hwprobe_key_is_bitmask(pair->key)
        ? (pair->value & other_pair->value) == other_pair->value
        : pair->value == other_pair->value;
}
// Build a mask with every simulated thread context marked online.
// Returns nullptr if allocation fails; caller frees with cpumask_free().
static inline RiscvLinux::cpumask_t *
get_cpu_online_mask(ThreadContext *tc)
{
RiscvLinux::cpumask_t *cpu_online_mask = cpumask_malloc(tc);
if (cpu_online_mask != nullptr) {
// CPU_SET writes the dense glibc cpu_set_t layout (bit i of the
// underlying byte array) directly into bits[].
for (int i = 0; i < tc->getSystemPtr()->threads.size(); i++) {
CPU_SET(i, (cpu_set_t *)&cpu_online_mask->bits);
}
}
return cpu_online_mask;
}
// Resolve one hwprobe (key, value) query against the simulated hart's
// CSRs and system configuration.  Unknown keys do not fail the syscall;
// they are reported back with key = -1 and value = 0.
// NOTE(review): the `cpus` argument is accepted but never read here, so
// every CPU in the mask is assumed to share a single configuration.
static void
hwprobe_one_pair(ThreadContext *tc, RiscvLinux::riscv_hwprobe *pair,
RiscvLinux::cpumask_t *cpus)
{
switch (pair->key) {
case RiscvLinux::Mvendorid:
pair->value = tc->readMiscRegNoEffect(CSRData.at(CSR_MVENDORID).physIndex);
break;
case RiscvLinux::Marchid:
pair->value = tc->readMiscRegNoEffect(CSRData.at(CSR_MARCHID).physIndex);
break;
case RiscvLinux::Mimpid:
pair->value = tc->readMiscRegNoEffect(CSRData.at(CSR_MIMPID).physIndex);
break;
case RiscvLinux::BaseBehavior:
{
// IMA base behavior is reported only when I, M and A are all in misa.
MISA misa = tc->readMiscRegNoEffect(MISCREG_ISA);
RiscvLinux::key_base_behavior_t *base_behavior =
(RiscvLinux::key_base_behavior_t *)&pair->value;
if (misa.rvi && misa.rvm && misa.rva) {
base_behavior->ima = 1;
}
}
break;
case RiscvLinux::IMAExt0:
{
// F/D, C and V come from misa; the remaining extension bits below
// are advertised unconditionally.
MISA misa = tc->readMiscRegNoEffect(MISCREG_ISA);
RiscvLinux::key_ima_ext_0_t *ext = (RiscvLinux::key_ima_ext_0_t *)&pair->value;
if (misa.rvf && misa.rvd) ext->FD = 1;
if (misa.rvc) ext->C = 1;
if (misa.rvv) ext->V = 1;
ext->ZBA = 1;
ext->ZBB = 1;
ext->ZBS = 1;
ext->ZICBOZ = 1;
ext->ZBC = 1;
ext->ZBKB = 1;
ext->ZBKC = 1;
ext->ZBKX = 1;
ext->ZKND = 1;
ext->ZKNE = 1;
ext->ZKNH = 1;
ext->ZKSED = 1;
ext->ZKSH = 1;
ext->ZKT = 1;
ext->ZFH = 1;
ext->ZFHMIN = 1;
ext->ZVFH = 1;
ext->ZVFHMIN = 1;
ext->ZICOND = 1;
ext->ZVE64D = 1;
ext->ZCB = 1;
ext->ZCD = 1;
ext->ZCF = 1;
}
break;
case RiscvLinux::Cpuperf0:
case RiscvLinux::MisalignedScalarPerf:
// Misaligned scalar accesses are reported as "slow" for both keys.
pair->value = RiscvLinux::Slow;
break;
case RiscvLinux::ZicbozBlockSize:
pair->value = tc->getSystemPtr()->cacheLineSize();
break;
case RiscvLinux::HighestVirtAddress:
pair->value = tc->getProcessPtr()->memState->getMmapEnd();
break;
/*
* For forward compatibility, unknown keys don't fail the whole
* call, but get their element key set to -1 and value set to 0
* indicating they're unrecognized.
*/
default:
pair->key = -1;
pair->value = 0;
break;
}
}
// riscv_hwprobe "get values" mode: fill each requested (key, value) pair
// with the configuration shared by the CPUs in cpus_user (or all online
// CPUs when cpusetsize == 0 and cpus_user is null).  Returns 0 on
// success, or a negated errno (-EINVAL / -ENOMEM).
template <class OS>
static int
hwprobe_get_values(ThreadContext *tc, VPtr<> pairs, typename OS::size_t pair_count,
typename OS::size_t cpusetsize, VPtr<> cpus_user, unsigned int flags)
{
/* Check the reserved flags. */
if (flags != 0) {
return -EINVAL;
}
RiscvLinux::cpumask_t *cpu_online_mask = get_cpu_online_mask(tc);
if (cpu_online_mask == nullptr) {
return -ENOMEM;
}
RiscvLinux::cpumask_t *cpus = cpumask_malloc(tc);
if (cpus == nullptr) {
cpumask_free(cpu_online_mask);
return -ENOMEM;
}
// Clamp the user-supplied mask size to the simulated CPU count.
if (cpusetsize > cpu_online_mask->size) {
cpusetsize = cpu_online_mask->size;
}
RiscvLinux::riscv_hwprobe *pair;
BufferArg pairs_buf(pairs, sizeof(RiscvLinux::riscv_hwprobe) * pair_count);
/*
* The interface supports taking in a CPU mask, and returns values that
* are consistent across that mask. Allow userspace to specify NULL and
* 0 as a shortcut to all online CPUs.
*/
if (cpusetsize == 0 && !cpus_user) {
cpumask_copy(cpus, cpu_online_mask);
cpusetsize = cpu_online_mask->size;
} else {
// Pull the user mask into simulator memory and shrink both masks to
// the (clamped) user-provided size before intersecting.
BufferArg cpus_user_buf(cpus_user, cpusetsize);
cpus_user_buf.copyIn(SETranslatingPortProxy(tc));
cpu_online_mask->size = cpusetsize;
cpus->size = cpusetsize;
memcpy(cpus->bits, cpus_user_buf.bufferPtr(), cpusetsize);
/*
* Userspace must provide at least one online CPU, without that
* there's no way to define what is supported.
*/
cpumask_and(cpus, cpus, cpu_online_mask);
if (cpumask_empty(cpus)) {
cpumask_free(cpu_online_mask);
cpumask_free(cpus);
return -EINVAL;
}
}
// Resolve every pair in place in the guest buffer and write it back.
pairs_buf.copyIn(SETranslatingPortProxy(tc));
pair = (RiscvLinux::riscv_hwprobe *)pairs_buf.bufferPtr();
for (size_t i = 0; i < pair_count; i++, pair++) {
pair->value = 0;
hwprobe_one_pair(tc, pair, cpus);
}
pairs_buf.copyOut(SETranslatingPortProxy(tc));
cpumask_free(cpu_online_mask);
cpumask_free(cpus);
return 0;
}
// riscv_hwprobe "which cpus" mode: given fully-filled (key, value) pairs,
// narrow the user's CPU mask down to the CPUs that satisfy every pair.
// An invalid key aborts the scan, reports key = -1 and clears the user
// mask.  Returns 0 on success or a negated errno.
template <class OS>
static int
hwprobe_get_cpus(ThreadContext *tc, VPtr<> pairs, typename OS::size_t pair_count,
typename OS::size_t cpusetsize, VPtr<> cpus_user, unsigned int flags)
{
// This mode requires exactly the WHICH_CPUS flag and a non-empty mask.
if (flags != RISCV_HWPROBE_WHICH_CPUS) {
return -EINVAL;
}
if (cpusetsize == 0 || !cpus_user) {
return -EINVAL;
}
RiscvLinux::cpumask_t *cpu_online_mask = get_cpu_online_mask(tc);
if (cpu_online_mask == nullptr) {
return -ENOMEM;
}
RiscvLinux::cpumask_t *cpus = cpumask_malloc(tc);
if (cpus == nullptr) {
cpumask_free(cpu_online_mask);
return -ENOMEM;
}
// Scratch mask holding a single CPU at a time for the per-CPU probe.
RiscvLinux::cpumask_t *one_cpu = cpumask_malloc(tc);
if (one_cpu == nullptr) {
cpumask_free(cpu_online_mask);
cpumask_free(cpus);
return -ENOMEM;
}
if (cpusetsize > cpu_online_mask->size) {
cpusetsize = cpu_online_mask->size;
}
RiscvLinux::riscv_hwprobe *pair;
BufferArg cpus_user_buf(cpus_user, cpusetsize);
cpus_user_buf.copyIn(SETranslatingPortProxy(tc));
memcpy(cpus->bits, cpus_user_buf.bufferPtr(), cpusetsize);
// An all-zero user mask means "consider every online CPU".
if (cpumask_empty(cpus)) {
cpumask_copy(cpus, cpu_online_mask);
cpusetsize = cpu_online_mask->size;
}
cpumask_and(cpus, cpus, cpu_online_mask);
BufferArg pairs_buf(pairs, sizeof(RiscvLinux::riscv_hwprobe) * pair_count);
pairs_buf.copyIn(SETranslatingPortProxy(tc));
pair = (RiscvLinux::riscv_hwprobe *)pairs_buf.bufferPtr();
for (size_t i = 0; i < pair_count; i++, pair++) {
if (!riscv_hwprobe_key_is_valid(pair->key)) {
*pair = (RiscvLinux::riscv_hwprobe){ .key = -1, .value = 0 };
memset(cpus_user_buf.bufferPtr(), 0, cpusetsize);
break;
}
// Probe each candidate CPU individually; drop CPUs whose probed
// value does not satisfy the requested pair.
RiscvLinux::riscv_hwprobe tmp =
(RiscvLinux::riscv_hwprobe){ .key = pair->key, .value = 0 };
for (int cpu = 0; cpu < cpusetsize * 8; cpu++) {
if (!cpumask_test_cpu(cpu, cpus)) {
continue;
}
cpumask_set_cpu(cpu, one_cpu);
hwprobe_one_pair(tc, &tmp, one_cpu);
if (!riscv_hwprobe_pair_cmp(&tmp, pair)) {
cpumask_clear_cpu(cpu, cpus);
}
cpumask_clear_cpu(cpu, one_cpu);
}
}
// Write both the (possibly rewritten) pairs and the narrowed mask back.
pairs_buf.copyOut(SETranslatingPortProxy(tc));
cpus_user_buf.copyOut(SETranslatingPortProxy(tc));
cpumask_free(cpu_online_mask);
cpumask_free(cpus);
cpumask_free(one_cpu);
return 0;
}
// Entry point for the riscv_hwprobe syscall: dispatch on the WHICH_CPUS
// flag to either the "find matching CPUs" or the "get values" handler.
// `desc` is unused; the other arguments are forwarded unchanged.
template <class OS>
static SyscallReturn
riscvHWProbeFunc(SyscallDesc *desc, ThreadContext *tc, VPtr<> pairs,
typename OS::size_t pair_count, typename OS::size_t cpusetsize,
VPtr<> cpus_user, unsigned int flags)
{
    const bool which_cpus = (flags & RISCV_HWPROBE_WHICH_CPUS) != 0;
    return which_cpus
        ? hwprobe_get_cpus<OS>(tc, pairs, pair_count, cpusetsize,
                               cpus_user, flags)
        : hwprobe_get_values<OS>(tc, pairs, pair_count, cpusetsize,
                                 cpus_user, flags);
}
SyscallDescTable<SEWorkload::SyscallABI64> EmuLinux::syscallDescs64 = {
{ 0, "io_setup" },
{ 1, "io_destroy" },
@@ -382,6 +766,7 @@ SyscallDescTable<SEWorkload::SyscallABI64> EmuLinux::syscallDescs64 = {
{ 241, "perf_event_open" },
{ 242, "accept4" },
{ 243, "recvmmsg" },
{ 258, "riscv_hwprobe", riscvHWProbeFunc<RiscvLinux64> },
{ 260, "wait4", wait4Func<RiscvLinux64> },
{ 261, "prlimit64", prlimitFunc<RiscvLinux64> },
{ 262, "fanotify_init" },
@@ -410,6 +795,33 @@ SyscallDescTable<SEWorkload::SyscallABI64> EmuLinux::syscallDescs64 = {
{ 285, "copy_file_range" },
{ 286, "preadv2" },
{ 287, "pwritev2" },
{ 424, "pidfd_send_signal" },
{ 425, "io_uring_setup" },
{ 426, "io_uring_enter" },
{ 427, "io_uring_register" },
{ 428, "open_tree" },
{ 429, "move_mount" },
{ 430, "fsopen" },
{ 431, "fsconfig" },
{ 432, "fsmount" },
{ 433, "fspick" },
{ 434, "pidfd_open" },
{ 435, "clone3", clone3Func<RiscvLinux64> },
{ 436, "close_range" },
{ 437, "openat2" },
{ 438, "pidfd_getfd" },
{ 439, "faccessat2" },
{ 440, "process_madvise" },
{ 441, "epoll_pwait2" },
{ 442, "mount_setattr" },
{ 443, "quotactl_fd" },
{ 444, "landlock_create_ruleset" },
{ 445, "landlock_add_rule" },
{ 446, "landlock_restrict_self" },
{ 447, "memfd_secret" },
{ 448, "process_mrelease" },
{ 449, "futex_waitv" },
{ 450, "set_mempolicy_home_node" },
{ 1024, "open", openFunc<RiscvLinux64> },
{ 1025, "link", linkFunc },
{ 1026, "unlink", unlinkFunc },
@@ -721,6 +1133,7 @@ SyscallDescTable<SEWorkload::SyscallABI32> EmuLinux::syscallDescs32 = {
{ 241, "perf_event_open" },
{ 242, "accept4" },
{ 243, "recvmmsg" },
{ 258, "riscv_hwprobe", riscvHWProbeFunc<RiscvLinux32> },
{ 260, "wait4", wait4Func<RiscvLinux32> },
{ 261, "prlimit64", prlimitFunc<RiscvLinux32> },
{ 262, "fanotify_init" },

View File

@@ -149,6 +149,18 @@ inline constexpr RegId ArgumentRegs[] = {
int_reg::A4, int_reg::A5, int_reg::A6, int_reg::A7
};
const std::vector<RegId> PushPopRegList = {
int_reg::S11, int_reg::S10, int_reg::S9, int_reg::S8,
int_reg::S7, int_reg::S6, int_reg::S5, int_reg::S4,
int_reg::S3, int_reg::S2, int_reg::S1, int_reg::S0,
int_reg::Ra
};
inline constexpr RegId StackRegs[] = {
int_reg::S0, int_reg::S1, int_reg::S2, int_reg::S3,
int_reg::S4, int_reg::S5, int_reg::S6, int_reg::S7,
};
} // namespace RiscvISA
} // namespace gem5

View File

@@ -58,6 +58,7 @@ BitUnion64(ExtMachInst)
// Decoder state
Bitfield<63, 62> rv_type;
Bitfield<61> compressed;
Bitfield<60> enable_zcd;
// More bits for vector extension
Bitfield<57, 41> vl; // [0, 2**16]
Bitfield<40> vill;
@@ -126,6 +127,8 @@ BitUnion64(ExtMachInst)
Bitfield< 6, 2> rc2;
Bitfield< 9, 7> rp1;
Bitfield< 4, 2> rp2;
Bitfield< 9, 7> r1s;
Bitfield< 4, 2> r2s;
Bitfield<11, 7> fc1;
Bitfield< 6, 2> fc2;
Bitfield< 4, 2> fp2;
@@ -144,6 +147,8 @@ BitUnion64(ExtMachInst)
Bitfield<12, 10> cimm3;
Bitfield< 6, 5> cimm2;
Bitfield<12> cimm1;
Bitfield< 7, 4> rlist;
Bitfield< 3, 2> spimm;
// Pseudo instructions
Bitfield<31, 25> m5func;
// vector

View File

@@ -41,8 +41,6 @@ namespace gem5
namespace X86ISA
{
X86ISAInst::MicrocodeRom Decoder::microcodeRom;
Decoder::State
Decoder::doResetState()
{
@@ -671,9 +669,6 @@ Decoder::doImmediateState()
return nextState;
}
Decoder::InstBytes Decoder::dummy;
Decoder::InstCacheMap Decoder::instCacheMap;
StaticInstPtr
Decoder::decode(ExtMachInst mach_inst, Addr addr)
{

View File

@@ -60,19 +60,19 @@ class Decoder : public InstDecoder
// These are defined and documented in decoder_tables.cc
static const uint8_t SizeTypeToSize[3][10];
typedef const uint8_t ByteTable[256];
static ByteTable Prefixes[2];
static const ByteTable Prefixes[2];
static ByteTable UsesModRMOneByte;
static ByteTable UsesModRMTwoByte;
static ByteTable UsesModRMThreeByte0F38;
static ByteTable UsesModRMThreeByte0F3A;
static const ByteTable UsesModRMOneByte;
static const ByteTable UsesModRMTwoByte;
static const ByteTable UsesModRMThreeByte0F38;
static const ByteTable UsesModRMThreeByte0F3A;
static ByteTable ImmediateTypeOneByte;
static ByteTable ImmediateTypeTwoByte;
static ByteTable ImmediateTypeThreeByte0F38;
static ByteTable ImmediateTypeThreeByte0F3A;
static const ByteTable ImmediateTypeOneByte;
static const ByteTable ImmediateTypeTwoByte;
static const ByteTable ImmediateTypeThreeByte0F38;
static const ByteTable ImmediateTypeThreeByte0F3A;
static X86ISAInst::MicrocodeRom microcodeRom;
X86ISAInst::MicrocodeRom microcodeRom;
protected:
using MachInst = uint64_t;
@@ -88,7 +88,7 @@ class Decoder : public InstDecoder
{}
};
static InstBytes dummy;
InstBytes dummy;
// The bytes to be predecoded.
MachInst fetchChunk;
@@ -244,7 +244,7 @@ class Decoder : public InstDecoder
decode_cache::InstMap<ExtMachInst> *instMap = nullptr;
typedef std::unordered_map<
CacheKey, decode_cache::InstMap<ExtMachInst> *> InstCacheMap;
static InstCacheMap instCacheMap;
InstCacheMap instCacheMap;
StaticInstPtr decodeInst(ExtMachInst mach_inst);

View File

@@ -350,9 +350,9 @@ class Rate : public Base
"otherwise, it would be a Ratio");
private:
Rate<T1,T2>() {}
Rate() {}
public:
Rate<T1,T2>(Rate<T1,T2> const&) = delete;
Rate(Rate const&) = delete;
void operator=(Rate<T1,T2> const&) = delete;
static Rate<T1,T2>*
get()

View File

@@ -240,7 +240,11 @@ BaseCPU::postInterrupt(ThreadID tid, int int_num, int index)
// Only wake up syscall emulation if it is not waiting on a futex.
// This is to model the fact that instructions such as ARM SEV
// should wake up a WFE sleep, but not a futex syscall WAIT.
if (FullSystem || !system->futexMap.is_waiting(threadContexts[tid]))
//
// For RISC-V, the WFI sleep wake up is implementation defined.
// The SiFive WFI wake up the hart only if mip & mie != 0
if ((FullSystem && interrupts[tid]->isWakeUp()) ||
!system->futexMap.is_waiting(threadContexts[tid]))
wakeup(tid);
}
@@ -855,13 +859,13 @@ BaseCPU::GlobalStats::GlobalStats(statistics::Group *parent)
"Simulator op (including micro ops) rate (op/s)")
{
simInsts
.functor(BaseCPU::numSimulatedInsts)
.functor(BaseCPU::GlobalStats::numSimulatedInsts)
.precision(0)
.prereq(simInsts)
;
simOps
.functor(BaseCPU::numSimulatedOps)
.functor(BaseCPU::GlobalStats::numSimulatedOps)
.precision(0)
.prereq(simOps)
;

View File

@@ -156,6 +156,30 @@ class BaseCPU : public ClockedObject
statistics::Formula hostInstRate;
statistics::Formula hostOpRate;
Counter previousInsts = 0;
Counter previousOps = 0;
// Instructions committed since the last stats reset (running total minus
// the snapshot taken by resetStats()).
static Counter
numSimulatedInsts()
{
return totalNumSimulatedInsts() - (globalStats->previousInsts);
}
// Ops (including micro-ops) committed since the last stats reset.
static Counter
numSimulatedOps()
{
return totalNumSimulatedOps() - (globalStats->previousOps);
}
// Snapshot the running totals so the per-interval deltas restart at
// zero, then perform the normal group reset.
void
resetStats() override
{
previousInsts = totalNumSimulatedInsts();
previousOps = totalNumSimulatedOps();
statistics::Group::resetStats();
}
};
/**
@@ -609,7 +633,7 @@ class BaseCPU : public ClockedObject
static int numSimulatedCPUs() { return cpuList.size(); }
static Counter
numSimulatedInsts()
totalNumSimulatedInsts()
{
Counter total = 0;
@@ -621,7 +645,7 @@ class BaseCPU : public ClockedObject
}
static Counter
numSimulatedOps()
totalNumSimulatedOps()
{
Counter total = 0;

View File

@@ -1,4 +1,4 @@
# Copyright (c) 2017 ARM Limited
# Copyright (c) 2017, 2024 Arm Limited
# All rights reserved
#
# The license below extends only to copyright in the software and shall
@@ -57,6 +57,7 @@ class DefaultFUPool(FUPool):
FP_MultDiv(),
ReadPort(),
SIMD_Unit(),
Matrix_Unit(),
PredALU(),
WritePort(),
RdWrPort(),

View File

@@ -1,4 +1,4 @@
# Copyright (c) 2010, 2017, 2020 ARM Limited
# Copyright (c) 2010, 2017, 2020, 2024 Arm Limited
# All rights reserved.
#
# The license below extends only to copyright in the software and shall
@@ -109,10 +109,27 @@ class SIMD_Unit(FUDesc):
OpDesc(opClass="SimdExt"),
OpDesc(opClass="SimdFloatExt"),
OpDesc(opClass="SimdConfig"),
OpDesc(opClass="SimdAes"),
OpDesc(opClass="SimdAesMix"),
OpDesc(opClass="SimdSha1Hash"),
OpDesc(opClass="SimdSha1Hash2"),
OpDesc(opClass="SimdSha256Hash"),
OpDesc(opClass="SimdSha256Hash2"),
OpDesc(opClass="SimdShaSigma2"),
OpDesc(opClass="SimdShaSigma3"),
]
count = 4
# Functional unit handling the matrix-extension op classes; a single unit
# serves all three matrix op classes.
class Matrix_Unit(FUDesc):
opList = [
OpDesc(opClass="Matrix"),
OpDesc(opClass="MatrixMov"),
OpDesc(opClass="MatrixOP"),
]
count = 1
# Functional unit for SIMD predicate ALU operations.
class PredALU(FUDesc):
opList = [OpDesc(opClass="SimdPredAlu")]
count = 1

View File

@@ -122,7 +122,7 @@ ElasticTrace::regEtraceListeners()
{
assert(!allProbesReg);
inform("@%llu: No. of instructions committed = %llu, registering elastic"
" probe listeners", curTick(), cpu->numSimulatedInsts());
" probe listeners", curTick(), cpu->totalNumSimulatedInsts());
// Create new listeners: provide method to be called upon a notify() for
// each probe point.
listeners.push_back(new ProbeListenerArg<ElasticTrace, RequestPtr>(this,

View File

@@ -38,6 +38,8 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
from m5.objects.ClockedObject import ClockedObject
from m5.objects.IndexingPolicies import *
from m5.objects.ReplacementPolicies import *
from m5.params import *
from m5.proxy import *
from m5.SimObject import *
@@ -83,6 +85,38 @@ class BranchTargetBuffer(ClockedObject):
numThreads = Param.Unsigned(Parent.numThreads, "Number of threads")
# Abstract base for BTB indexing policies, wrapping the C++
# IndexingPolicyTemplate specialized for BTB tags.
class BTBIndexingPolicy(SimObject):
type = "BTBIndexingPolicy"
abstract = True
cxx_class = "gem5::IndexingPolicyTemplate<gem5::BTBTagType>"
cxx_header = "cpu/pred/btb_entry.hh"
cxx_template_params = ["class Types"]
# Get the associativity
assoc = Param.Int(Parent.assoc, "associativity")
# Set-associative BTB indexing: the set index is taken from the PC after
# dropping `set_shift` low-order bits.
class BTBSetAssociative(BTBIndexingPolicy):
type = "BTBSetAssociative"
cxx_class = "gem5::BTBSetAssociative"
cxx_header = "cpu/pred/btb_entry.hh"
# Get the number of entries in the BTB from the parent
num_entries = Param.Unsigned(
Parent.numEntries, "Number of entries in the BTB"
)
# Set shift for the index. Ignore lower 2 bits for a 4 byte instruction.
set_shift = Param.Unsigned(2, "Number of bits to shift PC to get index")
# Total number of bits in the tag.
# This is above the index and offset bit
tag_bits = Param.Unsigned(64, "number of bits in the tag")
# Number of threads sharing the BTB
numThreads = Param.Unsigned(Parent.numThreads, "Number of threads")
class SimpleBTB(BranchTargetBuffer):
type = "SimpleBTB"
cxx_class = "gem5::branch_prediction::SimpleBTB"
@@ -93,6 +127,19 @@ class SimpleBTB(BranchTargetBuffer):
instShiftAmt = Param.Unsigned(
Parent.instShiftAmt, "Number of bits to shift instructions by"
)
associativity = Param.Unsigned(1, "BTB associativity")
btbReplPolicy = Param.BaseReplacementPolicy(
LRURP(), "BTB replacement policy"
)
btbIndexingPolicy = Param.BTBIndexingPolicy(
BTBSetAssociative(
assoc=Parent.associativity,
num_entries=Parent.numEntries,
set_shift=Parent.instShiftAmt,
numThreads=1,
),
"BTB indexing policy",
)
class IndirectPredictor(SimObject):

View File

@@ -45,7 +45,7 @@ SimObject('BranchPredictor.py',
sim_objects=[
'BranchPredictor',
'IndirectPredictor', 'SimpleIndirectPredictor',
'BranchTargetBuffer', 'SimpleBTB',
'BranchTargetBuffer', 'SimpleBTB', 'BTBIndexingPolicy', 'BTBSetAssociative',
'ReturnAddrStack',
'LocalBP', 'TournamentBP', 'BiModeBP', 'TAGEBase', 'TAGE', 'LoopPredictor',
'TAGE_SC_L_TAGE', 'TAGE_SC_L_TAGE_64KB', 'TAGE_SC_L_TAGE_8KB',

288
src/cpu/pred/btb_entry.hh Normal file
View File

@@ -0,0 +1,288 @@
/*
* Copyright (c) 2024 Pranith Kumar
* All rights reserved.
*
* The license below extends only to copyright in the software and shall
* not be construed as granting a license to any other intellectual
* property including but not limited to intellectual property relating
* to a hardware implementation of the functionality of the software
* licensed hereunder. You may use the software subject to the license
* terms below provided that you ensure that this notice is replicated
* unmodified and in its entirety in all distributions of the software,
* modified or unmodified, in source code or in binary form.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met: redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer;
* redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution;
* neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/**
* @file
* Declaration of a BTB entry and BTB indexing policy.
*/
#ifndef __CPU_PRED_BTB_ENTRY_HH__
#define __CPU_PRED_BTB_ENTRY_HH__
#include <vector>
#include "arch/generic/pcstate.hh"
#include "base/intmath.hh"
#include "base/types.hh"
#include "cpu/static_inst.hh"
#include "mem/cache/replacement_policies/replaceable_entry.hh"
#include "mem/cache/tags/indexing_policies/base.hh"
#include "params/BTBIndexingPolicy.hh"
#include "params/BTBSetAssociative.hh"
namespace gem5 {
// Tag/key type descriptor consumed by IndexingPolicyTemplate to produce a
// BTB-specific indexing policy (see the BTBIndexingPolicy alias below).
class BTBTagType
{
public:
// Lookup key for a BTB entry: an instruction address plus the owning
// thread's id.
struct KeyType
{
Addr address;
ThreadID tid;
};
// Parameter struct the templated indexing policy is constructed from.
using Params = BTBIndexingPolicyParams;
};
// Concrete indexing-policy type used by the BTB.
// NOTE(review): this is an explicit instantiation in a header — confirm it
// is emitted in exactly one translation unit as intended.
using BTBIndexingPolicy = IndexingPolicyTemplate<BTBTagType>;
template class IndexingPolicyTemplate<BTBTagType>;
/**
 * Set-associative indexing policy for the BTB. The thread id is hashed
 * into the set index so threads sharing the BTB spread across sets.
 */
class BTBSetAssociative : public BTBIndexingPolicy
{
public:
PARAMS(BTBSetAssociative);
using KeyType = BTBTagType::KeyType;
/**
 * Construct from params: num_entries total entries, set_shift low PC
 * bits ignored when indexing (e.g. 2 for 4-byte instructions), and
 * tag_bits bits of tag kept above the index.
 */
BTBSetAssociative(const Params &p)
: BTBIndexingPolicy(p, p.num_entries, p.set_shift),
tagMask(mask(p.tag_bits))
{
setNumThreads(p.numThreads);
}
protected:
/**
 * Extract the set index for the instruction PC based on tid.
 * The tid is XORed into the upper index bits so different threads map
 * the same PC to different sets.
 * (setShift, tagShift and setMask are inherited from the base policy.)
 */
uint32_t
extractSet(const KeyType &key) const
{
return ((key.address >> setShift)
^ (key.tid << (tagShift - setShift - log2NumThreads)))
& setMask;
}
public:
/**
 * Find all possible entries for insertion and replacement of an address:
 * every way of the set selected by extractSet(key).
 */
std::vector<ReplaceableEntry*>
getPossibleEntries(const KeyType &key) const override
{
auto set_idx = extractSet(key);
assert(set_idx < sets.size());
return sets[set_idx];
}
/**
 * Set number of threads sharing the BTB (stored as log2 for use in the
 * set hash above).
 */
void
setNumThreads(unsigned num_threads)
{
log2NumThreads = log2i(num_threads);
}
/**
 * Generate the tag from the given address: the bits above tagShift,
 * masked down to tag_bits.
 */
Addr
extractTag(const Addr addr) const override
{
return (addr >> tagShift) & tagMask;
}
// Rebuilding a full address from (tag, set) is not needed by the BTB;
// deliberately left unimplemented.
Addr regenerateAddr(const KeyType &key,
const ReplaceableEntry* entry) const override
{
panic("Not implemented!");
return 0;
}
private:
/** Mask selecting tag_bits bits of the shifted address. */
const uint64_t tagMask;
/** log2 of the number of threads sharing the BTB. */
unsigned log2NumThreads;
};
namespace branch_prediction
{
/**
 * A BTB entry: holds the branch target PC and the static branch
 * instruction, and is located through the BTB indexing policy via a
 * tag-extractor callback.
 */
class BTBEntry : public ReplaceableEntry
{
public:
using IndexingPolicy = gem5::BTBIndexingPolicy;
using KeyType = gem5::BTBTagType::KeyType;
using TagExtractor = std::function<Addr(Addr)>;
/** Construct an invalid entry holding the given tag extractor.
 * The sentinel tag {MaxAddr, -1} marks "no tag assigned".
 */
BTBEntry(TagExtractor ext)
: inst(nullptr), extractTag(ext), valid(false), tag({MaxAddr, -1})
{}
/** Update the target and instruction in the BTB entry.
* During insertion, only the tag (key) is updated; the payload
* (target + inst) is filled in afterwards through this method.
*/
void
update(const PCStateBase &_target,
StaticInstPtr _inst)
{
set(target, _target);
inst = _inst;
}
/**
* Checks if the given tag information corresponds to this entry's:
* the entry is valid, the tag bits of the address match, and the
* thread id matches.
*/
bool
match(const KeyType &key) const
{
return isValid() && (tag.address == extractTag(key.address))
&& (tag.tid == key.tid);
}
/**
* Insert the block by assigning it a tag and marking it valid. Touches
* block if it hadn't been touched previously.
*/
void
insert(const KeyType &key)
{
setValid();
setTag({extractTag(key.address), key.tid});
}
/** Copy constructor.
* NOTE(review): the ReplaceableEntry base (set/way bookkeeping) is
* default-initialized rather than copied — confirm this is intended.
*/
BTBEntry(const BTBEntry &other)
{
valid = other.valid;
tag = other.tag;
inst = other.inst;
extractTag = other.extractTag;
set(target, other.target);
}
/** Assignment operator; copies the target via set() and, like the copy
* constructor, leaves the ReplaceableEntry base untouched.
*/
BTBEntry& operator=(const BTBEntry &other)
{
valid = other.valid;
tag = other.tag;
inst = other.inst;
extractTag = other.extractTag;
set(target, other.target);
return *this;
}
/**
* Checks if the entry is valid.
*/
bool isValid() const { return valid; }
/**
* Get tag associated to this block.
*/
KeyType getTag() const { return tag; }
/** Invalidate the block and restore the sentinel tag. Its contents are
* no longer valid. */
void
invalidate()
{
valid = false;
setTag({MaxAddr, -1});
}
/** The entry's target. */
std::unique_ptr<PCStateBase> target;
/** Pointer to the static branch inst at this address */
StaticInstPtr inst;
/** Debug string: tag, tid, valid bit plus base-class position info. */
std::string
print() const override
{
return csprintf("tag: %#x tid: %d valid: %d | %s", tag.address, tag.tid,
isValid(), ReplaceableEntry::print());
}
protected:
/**
* Set tag associated to this block.
*/
void setTag(KeyType _tag) { tag = _tag; }
/** Set valid bit. The block must be invalid beforehand. */
void
setValid()
{
assert(!isValid());
valid = true;
}
private:
/** Callback used to extract the tag from the entry */
TagExtractor extractTag;
/**
* Valid bit. The contents of this entry are only valid if this bit is set.
* @sa invalidate()
* @sa insert()
*/
bool valid;
/** The entry's tag. */
KeyType tag;
};
} // namespace gem5::branch_prediction
/**
 * This helper generates a tag extractor function object which is
 * typically used by Replaceable entries indexed with the
 * BaseIndexingPolicy.
 * It decouples indexing from tagging: entries call the functor without
 * directly holding a pointer to the indexing policy, which resides in
 * the cache.
 */
static constexpr auto
genTagExtractor(BTBIndexingPolicy *ip)
{
return [ip] (Addr addr) { return ip->extractTag(addr); };
}
}
#endif //__CPU_PRED_BTB_ENTRY_HH__

View File

@@ -44,84 +44,38 @@
#include "base/trace.hh"
#include "debug/BTB.hh"
namespace gem5
{
namespace branch_prediction
namespace gem5::branch_prediction
{
SimpleBTB::SimpleBTB(const SimpleBTBParams &p)
: BranchTargetBuffer(p),
numEntries(p.numEntries),
tagBits(p.tagBits),
instShiftAmt(p.instShiftAmt),
log2NumThreads(floorLog2(p.numThreads))
btb("simpleBTB", p.numEntries, p.associativity,
p.btbReplPolicy, p.btbIndexingPolicy,
BTBEntry(genTagExtractor(p.btbIndexingPolicy)))
{
DPRINTF(BTB, "BTB: Creating BTB object.\n");
if (!isPowerOf2(numEntries)) {
if (!isPowerOf2(p.numEntries)) {
fatal("BTB entries is not a power of 2!");
}
btb.resize(numEntries);
for (unsigned i = 0; i < numEntries; ++i) {
btb[i].valid = false;
}
idxMask = numEntries - 1;
tagMask = (1 << tagBits) - 1;
tagShiftAmt = instShiftAmt + floorLog2(numEntries);
}
void
SimpleBTB::memInvalidate()
{
for (unsigned i = 0; i < numEntries; ++i) {
btb[i].valid = false;
}
btb.clear();
}
inline
unsigned
SimpleBTB::getIndex(Addr instPC, ThreadID tid)
{
// Need to shift PC over by the word offset.
return ((instPC >> instShiftAmt)
^ (tid << (tagShiftAmt - instShiftAmt - log2NumThreads)))
& idxMask;
}
inline
Addr
SimpleBTB::getTag(Addr instPC)
{
return (instPC >> tagShiftAmt) & tagMask;
}
SimpleBTB::BTBEntry *
BTBEntry *
SimpleBTB::findEntry(Addr instPC, ThreadID tid)
{
unsigned btb_idx = getIndex(instPC, tid);
Addr inst_tag = getTag(instPC);
assert(btb_idx < numEntries);
if (btb[btb_idx].valid
&& inst_tag == btb[btb_idx].tag
&& btb[btb_idx].tid == tid) {
return &btb[btb_idx];
}
return nullptr;
return btb.findEntry({instPC, tid});
}
bool
SimpleBTB::valid(ThreadID tid, Addr instPC)
{
BTBEntry *entry = findEntry(instPC, tid);
BTBEntry *entry = btb.findEntry({instPC, tid});
return entry != nullptr;
}
@@ -134,11 +88,12 @@ SimpleBTB::lookup(ThreadID tid, Addr instPC, BranchType type)
{
stats.lookups[type]++;
BTBEntry *entry = findEntry(instPC, tid);
BTBEntry *entry = btb.accessEntry({instPC, tid});
if (entry) {
return entry->target.get();
}
stats.misses[type]++;
return nullptr;
}
@@ -146,11 +101,12 @@ SimpleBTB::lookup(ThreadID tid, Addr instPC, BranchType type)
const StaticInstPtr
SimpleBTB::getInst(ThreadID tid, Addr instPC)
{
BTBEntry *entry = findEntry(instPC, tid);
BTBEntry *entry = btb.findEntry({instPC, tid});
if (entry) {
return entry->inst;
}
return nullptr;
}
@@ -159,18 +115,13 @@ SimpleBTB::update(ThreadID tid, Addr instPC,
const PCStateBase &target,
BranchType type, StaticInstPtr inst)
{
unsigned btb_idx = getIndex(instPC, tid);
assert(btb_idx < numEntries);
stats.updates[type]++;
btb[btb_idx].tid = tid;
btb[btb_idx].valid = true;
set(btb[btb_idx].target, target);
btb[btb_idx].tag = getTag(instPC);
btb[btb_idx].inst = inst;
BTBEntry *victim = btb.findVictim({instPC, tid});
btb.insertEntry({instPC, tid}, victim);
victim->update(target, inst);
}
} // namespace branch_prediction
} // namespace gem5
} // namespace gem5::branch_prediction

View File

@@ -41,15 +41,16 @@
#ifndef __CPU_PRED_SIMPLE_BTB_HH__
#define __CPU_PRED_SIMPLE_BTB_HH__
#include "base/cache/associative_cache.hh"
#include "base/logging.hh"
#include "base/types.hh"
#include "cpu/pred/btb.hh"
#include "cpu/pred/btb_entry.hh"
#include "mem/cache/replacement_policies/replaceable_entry.hh"
#include "mem/cache/tags/indexing_policies/base.hh"
#include "params/SimpleBTB.hh"
namespace gem5
{
namespace branch_prediction
namespace gem5::branch_prediction
{
class SimpleBTB : public BranchTargetBuffer
@@ -66,38 +67,7 @@ class SimpleBTB : public BranchTargetBuffer
StaticInstPtr inst = nullptr) override;
const StaticInstPtr getInst(ThreadID tid, Addr instPC) override;
private:
struct BTBEntry
{
/** The entry's tag. */
Addr tag = 0;
/** The entry's target. */
std::unique_ptr<PCStateBase> target;
/** The entry's thread id. */
ThreadID tid;
/** Whether or not the entry is valid. */
bool valid = false;
/** Pointer to the static branch instruction at this address */
StaticInstPtr inst = nullptr;
};
/** Returns the index into the BTB, based on the branch's PC.
* @param inst_PC The branch to look up.
* @return Returns the index into the BTB.
*/
inline unsigned getIndex(Addr instPC, ThreadID tid);
/** Returns the tag bits of a given address.
* @param inst_PC The branch's address.
* @return Returns the tag bits.
*/
inline Addr getTag(Addr instPC);
/** Internal call to find an address in the BTB
* @param instPC The branch's address.
@@ -106,31 +76,9 @@ class SimpleBTB : public BranchTargetBuffer
BTBEntry *findEntry(Addr instPC, ThreadID tid);
/** The actual BTB. */
std::vector<BTBEntry> btb;
/** The number of entries in the BTB. */
unsigned numEntries;
/** The index mask. */
unsigned idxMask;
/** The number of tag bits per entry. */
unsigned tagBits;
/** The tag mask. */
unsigned tagMask;
/** Number of bits to shift PC when calculating index. */
unsigned instShiftAmt;
/** Number of bits to shift PC when calculating tag. */
unsigned tagShiftAmt;
/** Log2 NumThreads used for hashing threadid */
unsigned log2NumThreads;
AssociativeCache<BTBEntry> btb;
};
} // namespace branch_prediction
} // namespace gem5
} // namespace gem5::branch_prediction
#endif // __CPU_PRED_SIMPLE_BTB_HH__

View File

@@ -41,3 +41,4 @@ class TesterThread(ClockedObject):
thread_id = Param.Int("Unique TesterThread ID")
num_lanes = Param.Int("Number of lanes this thread has")
deadlock_threshold = Param.Cycles(1000000000, "Deadlock threshold")
cache_line_size = Param.UInt32("Size of cache line in cache")

View File

@@ -64,7 +64,9 @@ AddressManager::AddressManager(int n_atomic_locs, int n_normal_locs_per_atomic)
std::shuffle(
randAddressMap.begin(),
randAddressMap.end(),
std::default_random_engine(random_mt.random<unsigned>(0,UINT_MAX))
// TODO: This is a bug unrelated to this draft PR but the GPU tester is
// useful for testing this PR.
std::default_random_engine(random_mt.random<unsigned>(0,UINT_MAX-1))
);
// initialize atomic locations

View File

@@ -70,7 +70,7 @@ DmaThread::issueLoadOps()
Addr address = addrManager->getAddress(location);
DPRINTF(ProtocolTest, "%s Episode %d: Issuing Load - Addr %s\n",
this->getName(), curEpisode->getEpisodeId(),
ruby::printAddress(address));
printAddress(address));
int load_size = sizeof(Value);
@@ -127,7 +127,7 @@ DmaThread::issueStoreOps()
DPRINTF(ProtocolTest, "%s Episode %d: Issuing Store - Addr %s - "
"Value %d\n", this->getName(),
curEpisode->getEpisodeId(), ruby::printAddress(address),
curEpisode->getEpisodeId(), printAddress(address),
new_value);
auto req = std::make_shared<Request>(address, sizeof(Value),
@@ -211,7 +211,7 @@ DmaThread::hitCallback(PacketPtr pkt)
DPRINTF(ProtocolTest, "%s Episode %d: hitCallback - Command %s -"
" Addr %s\n", this->getName(), curEpisode->getEpisodeId(),
resp_cmd.toString(), ruby::printAddress(addr));
resp_cmd.toString(), printAddress(addr));
if (resp_cmd == MemCmd::SwapResp) {
// response to a pending atomic

View File

@@ -67,7 +67,7 @@ GpuWavefront::issueLoadOps()
Addr address = addrManager->getAddress(location);
DPRINTF(ProtocolTest, "%s Episode %d: Issuing Load - Addr %s\n",
this->getName(), curEpisode->getEpisodeId(),
ruby::printAddress(address));
printAddress(address));
int load_size = sizeof(Value);
@@ -124,7 +124,7 @@ GpuWavefront::issueStoreOps()
DPRINTF(ProtocolTest, "%s Episode %d: Issuing Store - Addr %s - "
"Value %d\n", this->getName(),
curEpisode->getEpisodeId(), ruby::printAddress(address),
curEpisode->getEpisodeId(), printAddress(address),
new_value);
auto req = std::make_shared<Request>(address, sizeof(Value),
@@ -178,7 +178,7 @@ GpuWavefront::issueAtomicOps()
DPRINTF(ProtocolTest, "%s Episode %d: Issuing Atomic_Inc - Addr %s\n",
this->getName(), curEpisode->getEpisodeId(),
ruby::printAddress(address));
printAddress(address));
// must be aligned with store size
assert(address % sizeof(Value) == 0);
@@ -268,7 +268,7 @@ GpuWavefront::hitCallback(PacketPtr pkt)
DPRINTF(ProtocolTest, "%s Episode %d: hitCallback - Command %s - "
"Addr %s\n", this->getName(),
curEpisode->getEpisodeId(), resp_cmd.toString(),
ruby::printAddress(addr));
printAddress(addr));
// whether the transaction is done after this hitCallback
bool isTransactionDone = true;

View File

@@ -43,6 +43,7 @@ TesterThread::TesterThread(const Params &p)
: ClockedObject(p),
threadEvent(this, "TesterThread tick"),
deadlockCheckEvent(this),
cacheLineSize(p.cache_line_size),
threadId(p.thread_id),
numLanes(p.num_lanes),
tester(nullptr), addrManager(nullptr), port(nullptr),
@@ -383,7 +384,7 @@ TesterThread::validateAtomicResp(Location loc, int lane, Value ret_val)
ss << threadName << ": Atomic Op returned unexpected value\n"
<< "\tEpisode " << curEpisode->getEpisodeId() << "\n"
<< "\tLane ID " << lane << "\n"
<< "\tAddress " << ruby::printAddress(addr) << "\n"
<< "\tAddress " << printAddress(addr) << "\n"
<< "\tAtomic Op's return value " << ret_val << "\n";
// print out basic info
@@ -409,7 +410,7 @@ TesterThread::validateLoadResp(Location loc, int lane, Value ret_val)
<< "\tTesterThread " << threadId << "\n"
<< "\tEpisode " << curEpisode->getEpisodeId() << "\n"
<< "\tLane ID " << lane << "\n"
<< "\tAddress " << ruby::printAddress(addr) << "\n"
<< "\tAddress " << printAddress(addr) << "\n"
<< "\tLoaded value " << ret_val << "\n"
<< "\tLast writer " << addrManager->printLastWriter(loc) << "\n";
@@ -467,7 +468,7 @@ TesterThread::printOutstandingReqs(const OutstandingReqTable& table,
for (const auto& m : table) {
for (const auto& req : m.second) {
ss << "\t\t\tAddr " << ruby::printAddress(m.first)
ss << "\t\t\tAddr " << printAddress(m.first)
<< ": delta (curCycle - issueCycle) = "
<< (cur_cycle - req.issueCycle) << std::endl;
}
@@ -488,4 +489,10 @@ TesterThread::printAllOutstandingReqs(std::stringstream& ss) const
<< pendingFenceCount << std::endl;
}
// Format an address for debug/trace output using this thread's configured
// cache line size.
// NOTE(review): the second argument is presumably the line size in bits,
// given the byte-to-bit conversion (cacheLineSize * 8) — confirm against
// ruby::printAddress.
std::string
TesterThread::printAddress(Addr addr) const
{
return ruby::printAddress(addr, cacheLineSize * 8);
}
} // namespace gem5

View File

@@ -132,6 +132,7 @@ class TesterThread : public ClockedObject
{}
};
int cacheLineSize;
// the unique global id of this thread
int threadId;
// width of this thread (1 for cpu thread & wf size for gpu wavefront)
@@ -204,6 +205,7 @@ class TesterThread : public ClockedObject
void printOutstandingReqs(const OutstandingReqTable& table,
std::stringstream& ss) const;
std::string printAddress(Addr addr) const;
};
} // namespace gem5

View File

@@ -124,7 +124,8 @@ Check::initiatePrefetch()
// push the subblock onto the sender state. The sequencer will
// update the subblock on the return
pkt->senderState = new SenderState(m_address, req->getSize());
pkt->senderState = new SenderState(m_address, req->getSize(),
CACHE_LINE_BITS);
if (port->sendTimingReq(pkt)) {
DPRINTF(RubyTest, "successfully initiated prefetch.\n");
@@ -161,7 +162,8 @@ Check::initiateFlush()
// push the subblock onto the sender state. The sequencer will
// update the subblock on the return
pkt->senderState = new SenderState(m_address, req->getSize());
pkt->senderState = new SenderState(m_address, req->getSize(),
CACHE_LINE_BITS);
if (port->sendTimingReq(pkt)) {
DPRINTF(RubyTest, "initiating Flush - successful\n");
@@ -207,7 +209,8 @@ Check::initiateAction()
// push the subblock onto the sender state. The sequencer will
// update the subblock on the return
pkt->senderState = new SenderState(writeAddr, req->getSize());
pkt->senderState = new SenderState(m_address, req->getSize(),
CACHE_LINE_BITS);
if (port->sendTimingReq(pkt)) {
DPRINTF(RubyTest, "initiating action - successful\n");
@@ -261,7 +264,8 @@ Check::initiateCheck()
// push the subblock onto the sender state. The sequencer will
// update the subblock on the return
pkt->senderState = new SenderState(m_address, req->getSize());
pkt->senderState = new SenderState(m_address, req->getSize(),
CACHE_LINE_BITS);
if (port->sendTimingReq(pkt)) {
DPRINTF(RubyTest, "initiating check - successful\n");
@@ -291,7 +295,9 @@ Check::performCallback(ruby::NodeID proc, ruby::SubBlock* data, Cycles curTime)
// This isn't exactly right since we now have multi-byte checks
// assert(getAddress() == address);
assert(ruby::makeLineAddress(m_address) == ruby::makeLineAddress(address));
int block_size_bits = CACHE_LINE_BITS;
assert(ruby::makeLineAddress(m_address, block_size_bits) ==
ruby::makeLineAddress(address, block_size_bits));
assert(data != NULL);
DPRINTF(RubyTest, "RubyTester Callback\n");
@@ -342,7 +348,7 @@ Check::performCallback(ruby::NodeID proc, ruby::SubBlock* data, Cycles curTime)
}
DPRINTF(RubyTest, "proc: %d, Address: 0x%x\n", proc,
ruby::makeLineAddress(m_address));
ruby::makeLineAddress(m_address, block_size_bits));
DPRINTF(RubyTest, "Callback done\n");
debugPrint();
}

View File

@@ -47,6 +47,7 @@ class SubBlock;
const int CHECK_SIZE_BITS = 2;
const int CHECK_SIZE = (1 << CHECK_SIZE_BITS);
const int CACHE_LINE_BITS = 6;
class Check
{

View File

@@ -90,7 +90,9 @@ class RubyTester : public ClockedObject
{
ruby::SubBlock subBlock;
SenderState(Addr addr, int size) : subBlock(addr, size) {}
SenderState(Addr addr, int size, int cl_size)
: subBlock(addr, size, cl_size)
{}
};

View File

@@ -81,8 +81,6 @@ class AMDGPUDevice(PciDevice):
InterruptPin = 2
ExpansionROM = 0
rom_binary = Param.String("ROM binary dumped from hardware")
trace_file = Param.String("MMIO trace collected on hardware")
checkpoint_before_mmios = Param.Bool(
False, "Take a checkpoint before the device begins sending MMIOs"
)

View File

@@ -58,12 +58,6 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p)
init_interrupt_count(0), _lastVMID(0),
deviceMem(name() + ".deviceMem", p.memories, false, "", false)
{
// Loading the rom binary dumped from hardware.
std::ifstream romBin;
romBin.open(p.rom_binary, std::ios::binary);
romBin.read((char *)rom.data(), ROM_SIZE);
romBin.close();
// System pointer needs to be explicitly set for device memory since
// DRAMCtrl uses it to get (1) cache line size and (2) the mem mode.
// Note this means the cache line size is system wide.
@@ -92,10 +86,6 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p)
panic("Unknown GPU device %s\n", p.device_name);
}
if (p.trace_file != "") {
mmioReader.readMMIOTrace(p.trace_file);
}
int sdma_id = 0;
for (auto& s : p.sdmas) {
s->setGPUDevice(this);

View File

@@ -168,10 +168,15 @@ GenericPciHost::write(PacketPtr pkt)
pkt->getSize());
PciDevice *const pci_dev(getDevice(dev_addr.first));
panic_if(!pci_dev,
warn_if(!pci_dev,
"%02x:%02x.%i: Write to config space on non-existent PCI device\n",
dev_addr.first.bus, dev_addr.first.dev, dev_addr.first.func);
if (!pci_dev) {
pkt->makeAtomicResponse();
return 20000; // 20ns default from PciDevice.py
}
// @todo Remove this after testing
pkt->headerDelay = pkt->payloadDelay = 0;

View File

@@ -53,7 +53,7 @@ Clint::Clint(const Params &params) :
BasicPioDevice(params, params.pio_size),
system(params.system),
nThread(params.num_threads),
signal(params.name + ".signal", 0, this),
signal(params.name + ".signal", 0, this, INT_RTC),
reset(params.name + ".reset"),
resetMtimecmp(params.reset_mtimecmp),
registers(params.name + ".registers", params.pio_addr, this,
@@ -69,9 +69,11 @@ Clint::Clint(const Params &params) :
void
Clint::raiseInterruptPin(int id)
{
// Increment mtime
// Increment mtime when received RTC signal
uint64_t& mtime = registers.mtime.get();
if (id == INT_RTC) {
mtime++;
}
for (int context_id = 0; context_id < nThread; context_id++) {
@@ -261,7 +263,7 @@ Clint::doReset() {
registers.msip[i].reset();
}
// We need to update the mtip interrupt bits when reset
raiseInterruptPin(0);
raiseInterruptPin(INT_RESET);
}
} // namespace gem5

View File

@@ -91,6 +91,13 @@ class Clint : public BasicPioDevice
void raiseInterruptPin(int id);
void lowerInterruptPin(int id) {}
// Interrupt ID
enum InterruptId
{
INT_RTC = 0, // received from RTC(signal port)
INT_RESET, // received from reset port
};
// Register bank
public:

View File

@@ -477,7 +477,7 @@ class VirtQueue : public Serializable
Index index;
};
VirtRing<T>(PortProxy &proxy, ByteOrder bo, uint16_t size) :
VirtRing(PortProxy &proxy, ByteOrder bo, uint16_t size) :
header{0, 0}, ring(size), _proxy(proxy), _base(0), byteOrder(bo)
{}
@@ -550,7 +550,7 @@ class VirtQueue : public Serializable
private:
// Remove default constructor
VirtRing<T>();
VirtRing();
/** Guest physical memory proxy */
PortProxy &_proxy;

View File

@@ -461,7 +461,7 @@ class CacheBlk : public TaggedEntry
protected:
/** The current coherence status of this block. @sa CoherenceBits */
unsigned coherence;
unsigned coherence = 0;
// The following setters have been marked as protected because their
// respective variables should only be modified at 2 moments:

View File

@@ -599,6 +599,22 @@ class BOPPrefetcher(QueuedPrefetcher):
on_inst = False
class SmsPrefetcher(QueuedPrefetcher):
# Paper: https://web.eecs.umich.edu/~twenisch/papers/isca06.pdf
type = "SmsPrefetcher"
cxx_class = "gem5::prefetch::Sms"
cxx_header = "mem/cache/prefetch/sms.hh"
ft_size = Param.Unsigned(64, "Size of Filter and Active generation table")
pht_size = Param.Unsigned(16384, "Size of pattern history table")
region_size = Param.Unsigned(4096, "Spatial region size")
queue_squash = True
queue_filter = True
cache_snoop = True
prefetch_on_access = True
on_inst = False
class SBOOEPrefetcher(QueuedPrefetcher):
type = "SBOOEPrefetcher"
cxx_class = "gem5::prefetch::SBOOE"

View File

@@ -31,8 +31,9 @@ Import('*')
SimObject('Prefetcher.py', sim_objects=[
'BasePrefetcher', 'MultiPrefetcher', 'QueuedPrefetcher',
'StridePrefetcherHashedSetAssociative', 'StridePrefetcher',
'TaggedPrefetcher', 'IndirectMemoryPrefetcher', 'SignaturePathPrefetcher',
'SignaturePathPrefetcherV2', 'AccessMapPatternMatching', 'AMPMPrefetcher',
'SmsPrefetcher', 'TaggedPrefetcher', 'IndirectMemoryPrefetcher',
'SignaturePathPrefetcher', 'SignaturePathPrefetcherV2',
'AccessMapPatternMatching', 'AMPMPrefetcher',
'DeltaCorrelatingPredictionTables', 'DCPTPrefetcher',
'IrregularStreamBufferPrefetcher', 'SlimAMPMPrefetcher',
'BOPPrefetcher', 'SBOOEPrefetcher', 'STeMSPrefetcher', 'PIFPrefetcher'])
@@ -47,6 +48,7 @@ Source('indirect_memory.cc')
Source('pif.cc')
Source('queued.cc')
Source('sbooe.cc')
Source('sms.cc')
Source('signature_path.cc')
Source('signature_path_v2.cc')
Source('slim_ampm.cc')

161
src/mem/cache/prefetch/sms.cc vendored Normal file
View File

@@ -0,0 +1,161 @@
/*
* Copyright (c) 2024 Samsung Electronics
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met: redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer;
* redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution;
* neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/**
* @file
* Describes a SMS prefetcher based on template policies.
*/
#include "mem/cache/prefetch/sms.hh"
#include "debug/HWPrefetch.hh"
#include "params/SmsPrefetcher.hh"
namespace gem5
{
namespace prefetch
{
/**
 * SMS (Spatial Memory Streaming) prefetcher constructor.
 *
 * @param p Tuning parameters: ft_size bounds both the filter table (FT)
 *          and the active generation table (AGT); pht_size bounds the
 *          pattern history table (PHT); region_size is the spatial
 *          region granularity in bytes.
 *
 * No explicit clear() calls are needed here: the table members (AGT,
 * AGTPC, FT, PHT, fifoFT, lruAGT, lruPHT) are default-constructed empty
 * before the constructor body runs, so clearing them was a no-op.
 */
Sms::Sms(const SmsPrefetcherParams &p)
    : Queued(p), Max_Contexts(p.ft_size), MAX_PHTSize(p.pht_size),
      Region_Size(p.region_size)
{
}
/**
 * React to a cache eviction: the eviction ends the active generation for
 * the evicted block's spatial region, so the offsets recorded in the AGT
 * are committed to the PHT, keyed by the (PC, offset) pair that triggered
 * the generation. PHT capacity is then enforced by LRU eviction and the
 * region's tracking state is dropped.
 *
 * @param info Eviction notification; only info.addr is used.
 */
void
Sms::notifyEvict(const EvictionInfo &info)
{
    Addr region_base = roundDown(info.addr, Region_Size);

    auto agt_it = AGT.find(region_base);
    if (agt_it != AGT.end()) {
        // Look up the trigger key with find() instead of operator[] so
        // an untracked region can never insert a spurious default entry
        // into AGTPC. Falls back to a zero-valued key if the AGTPC entry
        // is missing, matching the previous operator[] behavior.
        std::pair<Addr, Addr> pc_offset;
        auto pc_it = AGTPC.find(region_base);
        if (pc_it != AGTPC.end()) {
            pc_offset = pc_it->second;
        }

        // Replace any stale pattern recorded for this trigger with the
        // freshly observed offset set (equivalent to clear + re-insert).
        PHT[pc_offset] = agt_it->second;
        lruPHT.push_front(pc_offset);
    }

    // Enforce the PHT capacity bound, evicting least-recently-used
    // patterns.
    while (PHT.size() > MAX_PHTSize) {
        PHT.erase(lruPHT.back());
        lruPHT.pop_back();
    }

    // The generation is over either way; drop the region's tracking
    // state.
    AGTPC.erase(region_base);
    AGT.erase(region_base);
}
/**
 * Train the SMS tables with this access and, on a PHT hit, queue
 * prefetches for every offset previously recorded under the same
 * (PC, offset) trigger.
 *
 * Per-region training state machine (as implemented below):
 *   - untracked region      -> FT records the (PC, offset) trigger
 *   - second access (in FT) -> promoted to the AGT (active generation)
 *   - in AGT                -> offsets accumulate until the region is
 *                              evicted (notifyEvict commits them to PHT)
 *
 * @param pfi       Access being observed (address + PC required).
 * @param addresses Output list of prefetch candidates.
 * @param cache     Unused by this prefetcher.
 */
void
Sms::calculatePrefetch(const PrefetchInfo &pfi,
std::vector<AddrPriority> &addresses,
const CacheAccessor &cache)
{
if (!pfi.hasPC()) {
DPRINTF(HWPrefetch, "Ignoring request with no PC.\n");
return;
}
Addr blk_addr = blockAddress(pfi.getAddr());
Addr pc = pfi.getPC();
// Base of the spatial region and the block offset within it.
Addr region_base = roundDown(blk_addr, Region_Size);
Addr offset = blk_addr - region_base;
//Training
if (AGT.find(region_base) != AGT.end()) {
// A region must never be tracked by FT and AGT simultaneously.
assert (FT.find(region_base) == FT.end());
// Record Pattern
AGT[region_base].insert(offset);
//update LRU: move this region to the front of the AGT LRU queue
for (std::deque <Addr>::iterator lit = lruAGT.begin();
lit != lruAGT.end(); lit ++) {
if ((*lit) == region_base) {
lruAGT.erase(lit);
lruAGT.push_front(region_base);
break;
}
}
} else if (FT.find(region_base) != FT.end()) {
//move entry from FT to AGT (second access starts a generation)
AGT[region_base].insert(FT[region_base].second);
AGTPC[region_base] = FT[region_base];
lruAGT.push_front(region_base);
//Record latest offset
AGT[region_base].insert(offset);
//Recycle FT entry
FT.erase(region_base);
//Make space for next entry (LRU eviction of AGT + its trigger key)
while (AGT.size() > Max_Contexts) {
AGT.erase(lruAGT.back());
AGTPC.erase(lruAGT.back());
lruAGT.pop_back();
}
} else {
// Trigger Access: first touch of the region, bounded FIFO allocation
FT[region_base] = std::make_pair (pc,offset);
fifoFT.push_front(region_base);
while (FT.size() > Max_Contexts) {
FT.erase(fifoFT.back());
fifoFT.pop_back();
}
}
//Prediction: on a PHT hit, prefetch every recorded offset relative to
//the current region base
std::pair <Addr, Addr> pc_offset = std::make_pair(pc,offset);
if (PHT.find(pc_offset) != PHT.end()) {
for (std::set<Addr>::iterator it = PHT[pc_offset].begin();
it != PHT[pc_offset].end(); it ++) {
Addr pref_addr = blockAddress(region_base + (*it));
addresses.push_back(AddrPriority(pref_addr,0));
}
//Move the hit pattern to the front of the PHT LRU queue
for (std::deque < std::pair <Addr,Addr> >::iterator lit
= lruPHT.begin(); lit != lruPHT.end(); lit ++) {
if ((*lit) == pc_offset) {
lruPHT.erase(lit);
lruPHT.push_front(pc_offset);
break;
}
}
}
}
} // namespace prefetch
} // namespace gem5

82
src/mem/cache/prefetch/sms.hh vendored Normal file
View File

@@ -0,0 +1,82 @@
/*
* Copyright (c) 2024 Samsung Electronics
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met: redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer;
* redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution;
* neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/**
* @file
 * Describes an SMS (Spatial Memory Streaming) prefetcher.
*/
#ifndef __MEM_CACHE_PREFETCH_SMS_HH__
#define __MEM_CACHE_PREFETCH_SMS_HH__
#include <set>
#include "mem/cache/prefetch/queued.hh"
#include "mem/packet.hh"
namespace gem5
{
struct SmsPrefetcherParams;
namespace prefetch
{
class Sms : public Queued
{
  private:
    // Capacity of the FT and AGT (entries each). //= 64;
    const int Max_Contexts; //= 64;
    // Capacity of the PHT — presumably an entry count; //= 512;
    // TODO(review): confirm against the eviction logic in sms.cc.
    const uint64_t MAX_PHTSize; //= 512;
    // Spatial region size in bytes; addresses are grouped into
    // regions of this alignment for pattern learning. //= 4096;
    const Addr Region_Size; //= 4096;
    // Active Generation Table: region base -> set of block offsets
    // touched while the region's generation is active.
    std::map< Addr, std::set<Addr> > AGT;
    // Per-region (pc, offset) of the trigger access, kept in lockstep
    // with AGT so the pattern can later be filed under its trigger.
    std::map< Addr, std::pair<Addr,Addr> > AGTPC;
    // Filter Table: region base -> (pc, offset) of the first access;
    // a second access to the region promotes the entry into the AGT.
    std::map< Addr, std::pair<Addr,Addr> > FT;
    // Pattern History Table: (pc, offset) trigger -> learned set of
    // block offsets to prefetch.
    std::map< std::pair <Addr,Addr> , std::set<Addr> > PHT;
    // FIFO replacement order for FT entries.
    std::deque<Addr> fifoFT;
    // LRU replacement order for AGT entries (front = most recent).
    std::deque<Addr> lruAGT;
    // LRU replacement order for PHT entries (front = most recent).
    std::deque< std::pair <Addr,Addr> > lruPHT;
    using EvictionInfo = CacheDataUpdateProbeArg;
    // Probe hook: invoked on cache evictions (generation end).
    void notifyEvict(const EvictionInfo &info) override;
  public:
    Sms(const SmsPrefetcherParams &p);
    ~Sms() = default;
    // Queued-prefetcher entry point: trains the tables on this access
    // and appends predicted prefetch addresses for it.
    void calculatePrefetch(const PrefetchInfo &pfi,
                           std::vector<AddrPriority> &addresses,
                           const CacheAccessor &cache) override;
};
} // namespace prefetch
} // namespace gem5
#endif // __MEM_CACHE_PREFETCH_SMS_HH__

View File

@@ -51,37 +51,33 @@ maskLowOrderBits(Addr addr, unsigned int number)
}
Addr
getOffset(Addr addr)
getOffset(Addr addr, int cacheLineBits)
{
return bitSelect(addr, 0, RubySystem::getBlockSizeBits() - 1);
}
Addr
makeLineAddress(Addr addr)
{
return mbits<Addr>(addr, 63, RubySystem::getBlockSizeBits());
assert(cacheLineBits < 64);
return bitSelect(addr, 0, cacheLineBits - 1);
}
Addr
makeLineAddress(Addr addr, int cacheLineBits)
{
assert(cacheLineBits < 64);
return maskLowOrderBits(addr, cacheLineBits);
}
// returns the next stride address based on line address
Addr
makeNextStrideAddress(Addr addr, int stride)
makeNextStrideAddress(Addr addr, int stride, int cacheLineBytes)
{
return makeLineAddress(addr) +
static_cast<int>(RubySystem::getBlockSizeBytes()) * stride;
return makeLineAddress(addr, floorLog2(cacheLineBytes))
+ cacheLineBytes * stride;
}
std::string
printAddress(Addr addr)
printAddress(Addr addr, int cacheLineBits)
{
std::stringstream out;
out << "[" << std::hex << "0x" << addr << "," << " line 0x"
<< makeLineAddress(addr) << std::dec << "]";
<< makeLineAddress(addr, cacheLineBits) << std::dec << "]";
return out.str();
}

View File

@@ -33,6 +33,7 @@
#include <iomanip>
#include <iostream>
#include "base/intmath.hh"
#include "base/types.hh"
namespace gem5
@@ -44,11 +45,10 @@ namespace ruby
// selects bits inclusive
Addr bitSelect(Addr addr, unsigned int small, unsigned int big);
Addr maskLowOrderBits(Addr addr, unsigned int number);
Addr getOffset(Addr addr);
Addr makeLineAddress(Addr addr);
Addr getOffset(Addr addr, int cacheLineBits);
Addr makeLineAddress(Addr addr, int cacheLineBits);
Addr makeNextStrideAddress(Addr addr, int stride);
std::string printAddress(Addr addr);
Addr makeNextStrideAddress(Addr addr, int stride, int cacheLineBytes);
std::string printAddress(Addr addr, int cacheLineBits);
} // namespace ruby
} // namespace gem5

View File

@@ -40,8 +40,8 @@
#include "mem/ruby/common/DataBlock.hh"
#include "mem/ruby/common/Address.hh"
#include "mem/ruby/common/WriteMask.hh"
#include "mem/ruby/system/RubySystem.hh"
namespace gem5
{
@@ -51,17 +51,22 @@ namespace ruby
DataBlock::DataBlock(const DataBlock &cp)
{
assert(cp.isAlloc());
assert(cp.getBlockSize() > 0);
assert(!m_alloc);
uint8_t *block_update;
size_t block_bytes = RubySystem::getBlockSizeBytes();
m_data = new uint8_t[block_bytes];
memcpy(m_data, cp.m_data, block_bytes);
m_block_size = cp.getBlockSize();
m_data = new uint8_t[m_block_size];
memcpy(m_data, cp.m_data, m_block_size);
m_alloc = true;
m_block_size = m_block_size;
// If this data block is involved in an atomic operation, the effect
// of applying the atomic operations on the data block are recorded in
// m_atomicLog. If so, we must copy over every entry in the change log
for (size_t i = 0; i < cp.m_atomicLog.size(); i++) {
block_update = new uint8_t[block_bytes];
memcpy(block_update, cp.m_atomicLog[i], block_bytes);
block_update = new uint8_t[m_block_size];
memcpy(block_update, cp.m_atomicLog[i], m_block_size);
m_atomicLog.push_back(block_update);
}
}
@@ -69,21 +74,44 @@ DataBlock::DataBlock(const DataBlock &cp)
void
DataBlock::alloc()
{
m_data = new uint8_t[RubySystem::getBlockSizeBytes()];
assert(!m_alloc);
if (!m_block_size) {
return;
}
m_data = new uint8_t[m_block_size];
m_alloc = true;
clear();
}
void
DataBlock::realloc(int blk_size)
{
m_block_size = blk_size;
assert(m_block_size > 0);
if (m_alloc) {
delete [] m_data;
m_alloc = false;
}
alloc();
}
void
DataBlock::clear()
{
memset(m_data, 0, RubySystem::getBlockSizeBytes());
assert(m_alloc);
assert(m_block_size > 0);
memset(m_data, 0, m_block_size);
}
bool
DataBlock::equal(const DataBlock& obj) const
{
size_t block_bytes = RubySystem::getBlockSizeBytes();
assert(m_alloc);
assert(m_block_size > 0);
size_t block_bytes = m_block_size;
// Check that the block contents match
if (memcmp(m_data, obj.m_data, block_bytes)) {
return false;
@@ -102,7 +130,9 @@ DataBlock::equal(const DataBlock& obj) const
void
DataBlock::copyPartial(const DataBlock &dblk, const WriteMask &mask)
{
for (int i = 0; i < RubySystem::getBlockSizeBytes(); i++) {
assert(m_alloc);
assert(m_block_size > 0);
for (int i = 0; i < m_block_size; i++) {
if (mask.getMask(i, 1)) {
m_data[i] = dblk.m_data[i];
}
@@ -113,7 +143,9 @@ void
DataBlock::atomicPartial(const DataBlock &dblk, const WriteMask &mask,
bool isAtomicNoReturn)
{
for (int i = 0; i < RubySystem::getBlockSizeBytes(); i++) {
assert(m_alloc);
assert(m_block_size > 0);
for (int i = 0; i < m_block_size; i++) {
m_data[i] = dblk.m_data[i];
}
mask.performAtomic(m_data, m_atomicLog, isAtomicNoReturn);
@@ -122,7 +154,9 @@ DataBlock::atomicPartial(const DataBlock &dblk, const WriteMask &mask,
void
DataBlock::print(std::ostream& out) const
{
int size = RubySystem::getBlockSizeBytes();
assert(m_alloc);
assert(m_block_size > 0);
int size = m_block_size;
out << "[ ";
for (int i = 0; i < size; i++) {
out << std::setw(2) << std::setfill('0') << std::hex
@@ -147,6 +181,7 @@ DataBlock::popAtomicLogEntryFront()
void
DataBlock::clearAtomicLogEntries()
{
assert(m_alloc);
for (auto log : m_atomicLog) {
delete [] log;
}
@@ -156,35 +191,59 @@ DataBlock::clearAtomicLogEntries()
const uint8_t*
DataBlock::getData(int offset, int len) const
{
assert(offset + len <= RubySystem::getBlockSizeBytes());
assert(m_alloc);
assert(m_block_size > 0);
assert(offset + len <= m_block_size);
return &m_data[offset];
}
uint8_t*
DataBlock::getDataMod(int offset)
{
assert(m_alloc);
return &m_data[offset];
}
void
DataBlock::setData(const uint8_t *data, int offset, int len)
{
assert(m_alloc);
memcpy(&m_data[offset], data, len);
}
void
DataBlock::setData(PacketPtr pkt)
{
int offset = getOffset(pkt->getAddr());
assert(offset + pkt->getSize() <= RubySystem::getBlockSizeBytes());
assert(m_alloc);
assert(m_block_size > 0);
int offset = getOffset(pkt->getAddr(), floorLog2(m_block_size));
assert(offset + pkt->getSize() <= m_block_size);
pkt->writeData(&m_data[offset]);
}
DataBlock &
DataBlock::operator=(const DataBlock & obj)
{
// Reallocate if needed
if (m_alloc && m_block_size != obj.getBlockSize()) {
delete [] m_data;
m_block_size = obj.getBlockSize();
alloc();
} else if (!m_alloc) {
m_block_size = obj.getBlockSize();
alloc();
// Assume this will be realloc'd later if zero.
if (m_block_size == 0) {
return *this;
}
} else {
assert(m_alloc && m_block_size == obj.getBlockSize());
}
assert(m_block_size > 0);
uint8_t *block_update;
size_t block_bytes = RubySystem::getBlockSizeBytes();
size_t block_bytes = m_block_size;
// Copy entire block contents from obj to current block
memcpy(m_data, obj.m_data, block_bytes);
// If this data block is involved in an atomic operation, the effect

View File

@@ -61,8 +61,14 @@ class WriteMask;
class DataBlock
{
public:
DataBlock()
// Ideally this should nost be called. We allow default so that protocols
// do not need to be changed.
DataBlock() = default;
DataBlock(int blk_size)
{
assert(!m_alloc);
m_block_size = blk_size;
alloc();
}
@@ -101,10 +107,16 @@ class DataBlock
bool equal(const DataBlock& obj) const;
void print(std::ostream& out) const;
int getBlockSize() const { return m_block_size; }
void setBlockSize(int block_size) { realloc(block_size); }
bool isAlloc() const { return m_alloc; }
void realloc(int blk_size);
private:
void alloc();
uint8_t *m_data;
bool m_alloc;
uint8_t *m_data = nullptr;
bool m_alloc = false;
int m_block_size = 0;
// Tracks block changes when atomic ops are applied
std::deque<uint8_t*> m_atomicLog;
@@ -124,18 +136,21 @@ DataBlock::assign(uint8_t *data)
inline uint8_t
DataBlock::getByte(int whichByte) const
{
assert(m_alloc);
return m_data[whichByte];
}
inline void
DataBlock::setByte(int whichByte, uint8_t data)
{
assert(m_alloc);
m_data[whichByte] = data;
}
inline void
DataBlock::copyPartial(const DataBlock & dblk, int offset, int len)
{
assert(m_alloc);
setData(&dblk.m_data[offset], offset, len);
}

View File

@@ -30,6 +30,8 @@
#include <algorithm>
#include "mem/ruby/system/RubySystem.hh"
namespace gem5
{
@@ -37,6 +39,11 @@ namespace ruby
{
NetDest::NetDest()
{
}
NetDest::NetDest(RubySystem *ruby_system)
: m_ruby_system(ruby_system)
{
resize();
}
@@ -44,6 +51,7 @@ NetDest::NetDest()
void
NetDest::add(MachineID newElement)
{
assert(m_bits.size() > 0);
assert(bitIndex(newElement.num) < m_bits[vecIndex(newElement)].getSize());
m_bits[vecIndex(newElement)].add(bitIndex(newElement.num));
}
@@ -51,6 +59,7 @@ NetDest::add(MachineID newElement)
void
NetDest::addNetDest(const NetDest& netDest)
{
assert(m_bits.size() > 0);
assert(m_bits.size() == netDest.getSize());
for (int i = 0; i < m_bits.size(); i++) {
m_bits[i].addSet(netDest.m_bits[i]);
@@ -60,6 +69,8 @@ NetDest::addNetDest(const NetDest& netDest)
void
NetDest::setNetDest(MachineType machine, const Set& set)
{
assert(m_ruby_system != nullptr);
// assure that there is only one set of destinations for this machine
assert(MachineType_base_level((MachineType)(machine + 1)) -
MachineType_base_level(machine) == 1);
@@ -69,12 +80,14 @@ NetDest::setNetDest(MachineType machine, const Set& set)
void
NetDest::remove(MachineID oldElement)
{
assert(m_bits.size() > 0);
m_bits[vecIndex(oldElement)].remove(bitIndex(oldElement.num));
}
void
NetDest::removeNetDest(const NetDest& netDest)
{
assert(m_bits.size() > 0);
assert(m_bits.size() == netDest.getSize());
for (int i = 0; i < m_bits.size(); i++) {
m_bits[i].removeSet(netDest.m_bits[i]);
@@ -84,6 +97,7 @@ NetDest::removeNetDest(const NetDest& netDest)
void
NetDest::clear()
{
assert(m_bits.size() > 0);
for (int i = 0; i < m_bits.size(); i++) {
m_bits[i].clear();
}
@@ -101,6 +115,8 @@ NetDest::broadcast()
void
NetDest::broadcast(MachineType machineType)
{
assert(m_ruby_system != nullptr);
for (NodeID i = 0; i < MachineType_base_count(machineType); i++) {
MachineID mach = {machineType, i};
add(mach);
@@ -111,6 +127,9 @@ NetDest::broadcast(MachineType machineType)
std::vector<NodeID>
NetDest::getAllDest()
{
assert(m_ruby_system != nullptr);
assert(m_bits.size() > 0);
std::vector<NodeID> dest;
dest.clear();
for (int i = 0; i < m_bits.size(); i++) {
@@ -127,6 +146,8 @@ NetDest::getAllDest()
int
NetDest::count() const
{
assert(m_bits.size() > 0);
int counter = 0;
for (int i = 0; i < m_bits.size(); i++) {
counter += m_bits[i].count();
@@ -137,12 +158,14 @@ NetDest::count() const
NodeID
NetDest::elementAt(MachineID index)
{
assert(m_bits.size() > 0);
return m_bits[vecIndex(index)].elementAt(bitIndex(index.num));
}
MachineID
NetDest::smallestElement() const
{
assert(m_bits.size() > 0);
assert(count() > 0);
for (int i = 0; i < m_bits.size(); i++) {
for (NodeID j = 0; j < m_bits[i].getSize(); j++) {
@@ -158,6 +181,9 @@ NetDest::smallestElement() const
MachineID
NetDest::smallestElement(MachineType machine) const
{
assert(m_bits.size() > 0);
assert(m_ruby_system != nullptr);
int size = m_bits[MachineType_base_level(machine)].getSize();
for (NodeID j = 0; j < size; j++) {
if (m_bits[MachineType_base_level(machine)].isElement(j)) {
@@ -173,6 +199,7 @@ NetDest::smallestElement(MachineType machine) const
bool
NetDest::isBroadcast() const
{
assert(m_bits.size() > 0);
for (int i = 0; i < m_bits.size(); i++) {
if (!m_bits[i].isBroadcast()) {
return false;
@@ -185,6 +212,7 @@ NetDest::isBroadcast() const
bool
NetDest::isEmpty() const
{
assert(m_bits.size() > 0);
for (int i = 0; i < m_bits.size(); i++) {
if (!m_bits[i].isEmpty()) {
return false;
@@ -197,8 +225,9 @@ NetDest::isEmpty() const
NetDest
NetDest::OR(const NetDest& orNetDest) const
{
assert(m_bits.size() > 0);
assert(m_bits.size() == orNetDest.getSize());
NetDest result;
NetDest result(m_ruby_system);
for (int i = 0; i < m_bits.size(); i++) {
result.m_bits[i] = m_bits[i].OR(orNetDest.m_bits[i]);
}
@@ -209,8 +238,9 @@ NetDest::OR(const NetDest& orNetDest) const
NetDest
NetDest::AND(const NetDest& andNetDest) const
{
assert(m_bits.size() > 0);
assert(m_bits.size() == andNetDest.getSize());
NetDest result;
NetDest result(m_ruby_system);
for (int i = 0; i < m_bits.size(); i++) {
result.m_bits[i] = m_bits[i].AND(andNetDest.m_bits[i]);
}
@@ -221,6 +251,7 @@ NetDest::AND(const NetDest& andNetDest) const
bool
NetDest::intersectionIsNotEmpty(const NetDest& other_netDest) const
{
assert(m_bits.size() > 0);
assert(m_bits.size() == other_netDest.getSize());
for (int i = 0; i < m_bits.size(); i++) {
if (!m_bits[i].intersectionIsEmpty(other_netDest.m_bits[i])) {
@@ -233,6 +264,7 @@ NetDest::intersectionIsNotEmpty(const NetDest& other_netDest) const
bool
NetDest::isSuperset(const NetDest& test) const
{
assert(m_bits.size() > 0);
assert(m_bits.size() == test.getSize());
for (int i = 0; i < m_bits.size(); i++) {
@@ -246,12 +278,15 @@ NetDest::isSuperset(const NetDest& test) const
bool
NetDest::isElement(MachineID element) const
{
assert(m_bits.size() > 0);
return ((m_bits[vecIndex(element)])).isElement(bitIndex(element.num));
}
void
NetDest::resize()
{
assert(m_ruby_system != nullptr);
m_bits.resize(MachineType_base_level(MachineType_NUM));
assert(m_bits.size() == MachineType_NUM);
@@ -263,6 +298,7 @@ NetDest::resize()
void
NetDest::print(std::ostream& out) const
{
assert(m_bits.size() > 0);
out << "[NetDest (" << m_bits.size() << ") ";
for (int i = 0; i < m_bits.size(); i++) {
@@ -277,6 +313,7 @@ NetDest::print(std::ostream& out) const
bool
NetDest::isEqual(const NetDest& n) const
{
assert(m_bits.size() > 0);
assert(m_bits.size() == n.m_bits.size());
for (unsigned int i = 0; i < m_bits.size(); ++i) {
if (!m_bits[i].isEqual(n.m_bits[i]))
@@ -285,5 +322,19 @@ NetDest::isEqual(const NetDest& n) const
return true;
}
int
NetDest::MachineType_base_count(const MachineType& obj)
{
assert(m_ruby_system != nullptr);
return m_ruby_system->MachineType_base_count(obj);
}
int
NetDest::MachineType_base_number(const MachineType& obj)
{
assert(m_ruby_system != nullptr);
return m_ruby_system->MachineType_base_number(obj);
}
} // namespace ruby
} // namespace gem5

View File

@@ -41,6 +41,8 @@ namespace gem5
namespace ruby
{
class RubySystem;
// NetDest specifies the network destination of a Message
class NetDest
{
@@ -48,6 +50,7 @@ class NetDest
// Constructors
// creates and empty set
NetDest();
NetDest(RubySystem *ruby_system);
explicit NetDest(int bit_size);
NetDest& operator=(const Set& obj);
@@ -98,6 +101,8 @@ class NetDest
void print(std::ostream& out) const;
void setRubySystem(RubySystem *rs) { m_ruby_system = rs; resize(); }
private:
// returns a value >= MachineType_base_level("this machine")
// and < MachineType_base_level("next highest machine")
@@ -112,6 +117,12 @@ class NetDest
NodeID bitIndex(NodeID index) const { return index; }
std::vector<Set> m_bits; // a vector of bit vectors - i.e. Sets
// Needed to call MacheinType_base_count/level
RubySystem *m_ruby_system = nullptr;
int MachineType_base_count(const MachineType& obj);
int MachineType_base_number(const MachineType& obj);
};
inline std::ostream&

View File

@@ -38,13 +38,14 @@ namespace ruby
using stl_helpers::operator<<;
SubBlock::SubBlock(Addr addr, int size)
SubBlock::SubBlock(Addr addr, int size, int cl_bits)
{
m_address = addr;
resize(size);
for (int i = 0; i < size; i++) {
setByte(i, 0);
}
m_cache_line_bits = cl_bits;
}
void
@@ -52,7 +53,7 @@ SubBlock::internalMergeFrom(const DataBlock& data)
{
int size = getSize();
assert(size > 0);
int offset = getOffset(m_address);
int offset = getOffset(m_address, m_cache_line_bits);
for (int i = 0; i < size; i++) {
this->setByte(i, data.getByte(offset + i));
}
@@ -63,7 +64,7 @@ SubBlock::internalMergeTo(DataBlock& data) const
{
int size = getSize();
assert(size > 0);
int offset = getOffset(m_address);
int offset = getOffset(m_address, m_cache_line_bits);
for (int i = 0; i < size; i++) {
// This will detect crossing a cache line boundary
data.setByte(offset + i, this->getByte(i));

View File

@@ -45,7 +45,7 @@ class SubBlock
{
public:
SubBlock() { }
SubBlock(Addr addr, int size);
SubBlock(Addr addr, int size, int cl_bits);
~SubBlock() { }
Addr getAddress() const { return m_address; }
@@ -74,6 +74,7 @@ class SubBlock
// Data Members (m_ prefix)
Addr m_address;
std::vector<uint8_t> m_data;
int m_cache_line_bits;
};
inline std::ostream&

Some files were not shown because too many files have changed in this diff Show More