Merge branch 'develop' into requirements-fixer-hook

This commit is contained in:
Bobby R. Bruce
2023-10-09 22:37:25 -07:00
committed by GitHub
23 changed files with 1385 additions and 923 deletions

View File

@@ -1,3 +1,4 @@
---
# This workflow runs after a pull-request has been approved by a reviewer.
name: CI Tests
@@ -75,12 +76,12 @@ jobs:
run: apt install -y jq
- name: Get directories for testlib-quick
working-directory: "${{ github.workspace }}/tests"
working-directory: ${{ github.workspace }}/tests
id: dir-matrix
run: echo "test-dirs-matrix=$(find gem5/* -type d -maxdepth 0 | jq -ncR '[inputs]')" >>$GITHUB_OUTPUT
- name: Get the build targets for testlib-quick-gem5-builds
working-directory: "${{ github.workspace }}/tests"
working-directory: ${{ github.workspace }}/tests
id: build-matrix
run: echo "build-matrix=$(./main.py list --build-targets -q | jq -ncR '[inputs]')" >>$GITHUB_OUTPUT
@@ -130,10 +131,7 @@ jobs:
test-dir: ${{ fromJson(needs.testlib-quick-matrix.outputs.test-dirs-matrix) }}
steps:
- name: Clean runner
run:
rm -rf ./* || true
rm -rf ./.??* || true
rm -rf ~/.cache || true
run: rm -rf ./* || true rm -rf ./.??* || true rm -rf ~/.cache || true
# Checkout the repository then download the gem5.opt artifact.
- uses: actions/checkout@v3
@@ -165,7 +163,8 @@ jobs:
if: success() || failure()
uses: actions/upload-artifact@v3
with:
name: ci-tests-run-${{ github.run_number }}-attempt-${{ github.run_attempt }}-testlib-quick-${{ steps.sanitize-test-dir.outputs.sanatized-test-dir }}-status-${{ steps.run-tests.outcome }}-output
name: ci-tests-run-${{ github.run_number }}-attempt-${{ github.run_attempt }}-testlib-quick-${{ steps.sanitize-test-dir.outputs.sanatized-test-dir
}}-status-${{ steps.run-tests.outcome }}-output
path: tests/testing-results
retention-days: 30

View File

@@ -1,3 +1,4 @@
---
# This workflow runs all of the compiler tests
name: Compiler Tests
@@ -5,7 +6,7 @@ name: Compiler Tests
on:
# Runs every Friday from 7AM UTC
schedule:
- cron: '00 7 * * 5'
- cron: 00 7 * * 5
# Allows us to manually start workflow for testing
workflow_dispatch:
@@ -15,7 +16,9 @@ jobs:
strategy:
fail-fast: false
matrix:
image: [gcc-version-12, gcc-version-11, gcc-version-10, gcc-version-9, gcc-version-8, clang-version-16, clang-version-15, clang-version-14, clang-version-13, clang-version-12, clang-version-11, clang-version-10, clang-version-9, clang-version-8, clang-version-7, ubuntu-20.04_all-dependencies, ubuntu-22.04_all-dependencies, ubuntu-22.04_min-dependencies]
image: [gcc-version-12, gcc-version-11, gcc-version-10, gcc-version-9, gcc-version-8, clang-version-16, clang-version-15, clang-version-14,
clang-version-13, clang-version-12, clang-version-11, clang-version-10, clang-version-9, clang-version-8, clang-version-7, ubuntu-20.04_all-dependencies,
ubuntu-22.04_all-dependencies, ubuntu-22.04_min-dependencies]
opts: [.opt, .fast]
runs-on: [self-hosted, linux, x64, build]
timeout-minutes: 2880 # 48 hours
@@ -35,7 +38,9 @@ jobs:
strategy:
fail-fast: false
matrix:
gem5-compilation: [ARM, ARM_MESI_Three_Level, ARM_MESI_Three_Level_HTM, ARM_MOESI_hammer, Garnet_standalone, GCN3_X86, MIPS, 'NULL', NULL_MESI_Two_Level, NULL_MOESI_CMP_directory, NULL_MOESI_CMP_token, NULL_MOESI_hammer, POWER, RISCV, SPARC, X86, X86_MI_example, X86_MOESI_AMD_Base, VEGA_X86, GCN3_X86]
gem5-compilation: [ARM, ARM_MESI_Three_Level, ARM_MESI_Three_Level_HTM, ARM_MOESI_hammer, Garnet_standalone, GCN3_X86, MIPS, 'NULL', NULL_MESI_Two_Level,
NULL_MOESI_CMP_directory, NULL_MOESI_CMP_token, NULL_MOESI_hammer, POWER, RISCV, SPARC, X86, X86_MI_example, X86_MOESI_AMD_Base, VEGA_X86,
GCN3_X86]
image: [gcc-version-12, clang-version-16]
opts: [.opt]
runs-on: [self-hosted, linux, x64, build]

View File

@@ -1,3 +1,4 @@
---
# This workflow runs all of the long tests within main.py, extra tests in nightly.sh, and unittests
name: Daily Tests
@@ -5,7 +6,7 @@ name: Daily Tests
on:
# Runs every day from 7AM UTC
schedule:
- cron: '0 7 * * *'
- cron: 0 7 * * *
jobs:
name-artifacts:
@@ -22,7 +23,7 @@ jobs:
fail-fast: false
matrix:
# NULL is in quotes since it is considered a keyword in yaml files
image: [ALL, ALL_CHI, ARM, ALL_MSI, ALL_MESI_Two_Level, "NULL", NULL_MI_example, RISCV, VEGA_X86]
image: [ALL, ALL_CHI, ARM, ALL_MSI, ALL_MESI_Two_Level, 'NULL', NULL_MI_example, RISCV, VEGA_X86]
# this allows us to pass additional command line parameters
# the default is to add -j $(nproc), but some images
# require more specifications when built
@@ -76,17 +77,15 @@ jobs:
strategy:
fail-fast: false
matrix:
test-type: [arm_boot_tests, fs, gpu, insttest_se, learning_gem5, m5threads_test_atomic, memory, multi_isa, replacement_policies, riscv_boot_tests, stdlib, x86_boot_tests]
test-type: [arm_boot_tests, fs, gpu, insttest_se, learning_gem5, m5threads_test_atomic, memory, multi_isa, replacement_policies, riscv_boot_tests,
stdlib, x86_boot_tests]
runs-on: [self-hosted, linux, x64, run]
container: gcr.io/gem5-test/ubuntu-22.04_all-dependencies:latest
needs: [name-artifacts, build-gem5]
timeout-minutes: 1440 # 24 hours for entire matrix to run
steps:
- name: Clean runner
run:
rm -rf ./* || true
rm -rf ./.??* || true
rm -rf ~/.cache || true
run: rm -rf ./* || true rm -rf ./.??* || true rm -rf ~/.cache || true
- uses: actions/checkout@v3
with:
# Scheduled workflows run on the default branch by default. We
@@ -168,16 +167,14 @@ jobs:
strategy:
fail-fast: false
matrix:
test-type: [gem5-library-example-x86-ubuntu-run-ALL-x86_64-opt, gem5-library-example-riscv-ubuntu-run-ALL-x86_64-opt, lupv-example-ALL-x86_64-opt, gem5-library-example-arm-ubuntu-run-test-ALL-x86_64-opt, gem5-library-example-riscvmatched-hello-ALL-x86_64-opt]
test-type: [gem5-library-example-x86-ubuntu-run-ALL-x86_64-opt, gem5-library-example-riscv-ubuntu-run-ALL-x86_64-opt, lupv-example-ALL-x86_64-opt,
gem5-library-example-arm-ubuntu-run-test-ALL-x86_64-opt, gem5-library-example-riscvmatched-hello-ALL-x86_64-opt]
container: gcr.io/gem5-test/ubuntu-22.04_all-dependencies:latest
needs: [name-artifacts, build-gem5]
timeout-minutes: 1440 # 24 hours
steps:
- name: Clean runner
run:
rm -rf ./* || true
rm -rf ./.??* || true
rm -rf ~/.cache || true
run: rm -rf ./* || true rm -rf ./.??* || true rm -rf ~/.cache || true
- uses: actions/checkout@v3
with:
# Scheduled workflows run on the default branch by default. We
@@ -190,7 +187,8 @@ jobs:
- run: chmod u+x build/ALL/gem5.opt
- name: long ${{ matrix.test-type }} gem5_library_example_tests
working-directory: ${{ github.workspace }}/tests
run: ./main.py run --uid SuiteUID:tests/gem5/gem5_library_example_tests/test_gem5_library_examples.py:test-${{ matrix.test-type }} --length=long --skip-build -vv
run: ./main.py run --uid SuiteUID:tests/gem5/gem5_library_example_tests/test_gem5_library_examples.py:test-${{ matrix.test-type }} --length=long
--skip-build -vv
- name: create zip of results
if: success() || failure()
run: |
@@ -281,6 +279,8 @@ jobs:
with:
args: -q http://dist.gem5.org/dist/develop/test-progs/heterosync/gcn3/allSyncPrims-1kernel # Removed -N bc it wasn't available within actions, should be okay bc workspace is clean every time
- name: Run allSyncPrims-1kernel sleepMutex test with GCN3_X86/gem5.opt (SE mode)
run: ./build/GCN3_X86/gem5.opt configs/example/apu_se.py --reg-alloc-policy=dynamic -n3 -c allSyncPrims-1kernel --options="sleepMutex 10 16 4"
run: ./build/GCN3_X86/gem5.opt configs/example/apu_se.py --reg-alloc-policy=dynamic -n3 -c allSyncPrims-1kernel --options="sleepMutex 10 16
4"
- name: Run allSyncPrims-1kernel lfTreeBarrUsing test with GCN3_X86/gem5.opt (SE mode)
run: ./build/GCN3_X86/gem5.opt configs/example/apu_se.py --reg-alloc-policy=dynamic -n3 -c allSyncPrims-1kernel --options="lfTreeBarrUniq 10 16 4"
run: ./build/GCN3_X86/gem5.opt configs/example/apu_se.py --reg-alloc-policy=dynamic -n3 -c allSyncPrims-1kernel --options="lfTreeBarrUniq
10 16 4"

View File

@@ -1,3 +1,4 @@
---
name: Docker images build and push
on:

View File

@@ -1,3 +1,4 @@
---
# This workflow runs all of the very-long tests within main.py
name: Weekly Tests
@@ -5,7 +6,7 @@ name: Weekly Tests
on:
# Runs every Sunday from 7AM UTC
schedule:
- cron: '00 7 * * 6'
- cron: 00 7 * * 6
# Allows us to manually start workflow for testing
workflow_dispatch:

View File

@@ -1,8 +1,9 @@
---
# This workflow file contains miscellaneous tasks to manage the repository.
name: Utils for Repository
on:
schedule:
- cron: '30 1 * * *'
- cron: 30 1 * * *
workflow_dispatch:
jobs:
@@ -13,7 +14,8 @@ jobs:
steps:
- uses: actions/stale@v8.0.0
with:
close-issue-message: 'This issue is being closed because it has been inactive waiting for response for 30 days. If this is still an issue, please open a new issue and reference this one.'
close-issue-message: This issue is being closed because it has been inactive waiting for response for 30 days. If this is still an issue,
please open a new issue and reference this one.
days-before-stale: 21
days-before-close: 7
any-of-labels: 'needs details'
any-of-labels: needs details

View File

@@ -1,3 +1,4 @@
---
# This workflow runs all of the very-long tests within main.py
name: Weekly Tests
@@ -5,7 +6,7 @@ name: Weekly Tests
on:
# Runs every Sunday from 7AM UTC
schedule:
- cron: '00 7 * * 6'
- cron: 00 7 * * 6
# Allows us to manually start workflow for testing
workflow_dispatch:
@@ -45,10 +46,7 @@ jobs:
timeout-minutes: 4320 # 3 days
steps:
- name: Clean runner
run:
rm -rf ./* || true
rm -rf ./.??* || true
rm -rf ~/.cache || true
run: rm -rf ./* || true rm -rf ./.??* || true rm -rf ~/.cache || true
- uses: actions/checkout@v3
with:
# Scheduled workflows run on the default branch by default. We

View File

@@ -1,3 +1,4 @@
---
# Copyright (c) 2022 Arm Limited
# All rights reserved.
#
@@ -33,7 +34,7 @@
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
minimum_pre_commit_version: "2.18"
minimum_pre_commit_version: '2.18'
default_language_version:
python: python3
@@ -61,8 +62,12 @@ repos:
- id: check-added-large-files
- id: mixed-line-ending
args: [--fix=lf]
- id: requirements-txt-fixer
- id: check-case-conflict
- id: requirements-txt-fixer
- repo: https://github.com/jumanjihouse/pre-commit-hook-yamlfmt
rev: 0.2.3
hooks:
- id: yamlfmt
- repo: https://github.com/psf/black
rev: 22.6.0
hooks:
@@ -73,18 +78,18 @@ repos:
name: gem5 style checker
entry: util/git-pre-commit.py
always_run: true
exclude: ".*"
exclude: .*
language: system
description: 'The gem5 style checker hook.'
description: The gem5 style checker hook.
- id: gem5-commit-msg-checker
name: gem5 commit msg checker
entry: ext/git-commit-msg
language: system
stages: [commit-msg]
description: 'The gem5 commit message checker hook.'
description: The gem5 commit message checker hook.
- id: gerrit-commit-msg-job
name: gerrit commit message job
entry: util/gerrit-commit-msg-hook
language: system
stages: [commit-msg]
description: 'Adds Change-ID to the commit message. Needed by Gerrit.'
description: Adds Change-ID to the commit message. Needed by Gerrit.

View File

@@ -1,3 +1,4 @@
---
# See CONTRIBUTING.md for details of gem5's contribution process.
#
# This file contains a list of gem5's subsystems and their

View File

@@ -0,0 +1,58 @@
# Copyright (c) 2012, 2017-2018, 2023 Arm Limited
# All rights reserved.
#
# The license below extends only to copyright in the software and shall
# not be construed as granting a license to any other intellectual
# property including but not limited to intellectual property relating
# to a hardware implementation of the functionality of the software
# licensed hereunder. You may use the software subject to the license
# terms below provided that you ensure that this notice is replicated
# unmodified and in its entirety in all distributions of the software,
# modified or unmodified, in source code or in binary form.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met: redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer;
# redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution;
# neither the name of the copyright holders nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
from m5.objects import *
from .O3_ARM_v7a import O3_ARM_v7a_3
# O3_ARM_v7a_3 adapted to generate elastic traces
class O3_ARM_v7a_3_Etrace(O3_ARM_v7a_3):
    """O3_ARM_v7a_3 variant adapted to generate elastic traces.

    Resource structures (ROB, LQ, SQ) are oversized so the pipeline
    never stalls on them during capture; any such stall would otherwise
    be folded into the recorded compute delays. For replay, the real
    ROB/LQ/SQ sizes are modelled in the Trace CPU instead.
    """

    # Make the number of entries in the ROB, LQ and SQ very
    # large so that there are no stalls due to resource
    # limitation as such stalls will get captured in the trace
    # as compute delay. For replay, ROB, LQ and SQ sizes are
    # modelled in the Trace CPU.
    numROBEntries = 512
    LQEntries = 128
    SQEntries = 128

    def attach_probe_listener(self, inst_trace_file, data_trace_file):
        # Attach the elastic trace probe listener. Set the protobuf trace
        # file names. Set the dependency window size equal to the cpu it
        # is attached to.
        # NOTE(review): the `m5` name is not visibly bound by this file's
        # imports (`from m5.objects import *` does not bind `m5` itself) —
        # confirm `import m5` is in scope, or use `ElasticTrace` directly.
        self.traceListener = m5.objects.ElasticTrace(
            instFetchTraceFile=inst_trace_file,
            dataDepTraceFile=data_trace_file,
            depWindowSize=3 * self.numROBEntries,
        )

View File

@@ -338,56 +338,15 @@ class FastmodelCluster(CpuCluster):
pass
class BaseSimpleSystem(ArmSystem):
cache_line_size = 64
def __init__(self, mem_size, platform, **kwargs):
super(BaseSimpleSystem, self).__init__(**kwargs)
self.voltage_domain = VoltageDomain(voltage="1.0V")
self.clk_domain = SrcClockDomain(
clock="1GHz", voltage_domain=Parent.voltage_domain
)
if platform is None:
self.realview = VExpress_GEM5_V1()
else:
self.realview = platform
if hasattr(self.realview.gic, "cpu_addr"):
self.gic_cpu_addr = self.realview.gic.cpu_addr
self.terminal = Terminal()
self.vncserver = VncServer()
self.iobus = IOXBar()
# Device DMA -> MEM
self.mem_ranges = self.getMemRanges(int(Addr(mem_size)))
class ClusterSystem:
"""
Base class providing cpu clusters generation/handling methods to
SE/FS systems
"""
def __init__(self, **kwargs):
self._clusters = []
def getMemRanges(self, mem_size):
"""
Define system memory ranges. This depends on the physical
memory map provided by the realview platform and by the memory
size provided by the user (mem_size argument).
The method is iterating over all platform ranges until they cover
the entire user's memory requirements.
"""
mem_ranges = []
for mem_range in self.realview._mem_regions:
size_in_range = min(mem_size, mem_range.size())
mem_ranges.append(
AddrRange(start=mem_range.start, size=size_in_range)
)
mem_size -= size_in_range
if mem_size == 0:
return mem_ranges
raise ValueError("memory size too big for platform capabilities")
def numCpuClusters(self):
return len(self._clusters)
@@ -423,6 +382,80 @@ class BaseSimpleSystem(ArmSystem):
cluster.connectMemSide(cluster_mem_bus)
class SimpleSeSystem(System, ClusterSystem):
"""
Example system class for syscall emulation mode
"""
# Use a fixed cache line size of 64 bytes
cache_line_size = 64
def __init__(self, **kwargs):
System.__init__(self, **kwargs)
ClusterSystem.__init__(self, **kwargs)
# Create a voltage and clock domain for system components
self.voltage_domain = VoltageDomain(voltage="3.3V")
self.clk_domain = SrcClockDomain(
clock="1GHz", voltage_domain=self.voltage_domain
)
# Create the off-chip memory bus.
self.membus = SystemXBar()
def connect(self):
self.system_port = self.membus.cpu_side_ports
class BaseSimpleSystem(ArmSystem, ClusterSystem):
cache_line_size = 64
def __init__(self, mem_size, platform, **kwargs):
ArmSystem.__init__(self, **kwargs)
ClusterSystem.__init__(self, **kwargs)
self.voltage_domain = VoltageDomain(voltage="1.0V")
self.clk_domain = SrcClockDomain(
clock="1GHz", voltage_domain=Parent.voltage_domain
)
if platform is None:
self.realview = VExpress_GEM5_V1()
else:
self.realview = platform
if hasattr(self.realview.gic, "cpu_addr"):
self.gic_cpu_addr = self.realview.gic.cpu_addr
self.terminal = Terminal()
self.vncserver = VncServer()
self.iobus = IOXBar()
# Device DMA -> MEM
self.mem_ranges = self.getMemRanges(int(Addr(mem_size)))
def getMemRanges(self, mem_size):
"""
Define system memory ranges. This depends on the physical
memory map provided by the realview platform and by the memory
size provided by the user (mem_size argument).
The method is iterating over all platform ranges until they cover
the entire user's memory requirements.
"""
mem_ranges = []
for mem_range in self.realview._mem_regions:
size_in_range = min(mem_size, mem_range.size())
mem_ranges.append(
AddrRange(start=mem_range.start, size=size_in_range)
)
mem_size -= size_in_range
if mem_size == 0:
return mem_ranges
raise ValueError("memory size too big for platform capabilities")
class SimpleSystem(BaseSimpleSystem):
"""
Meant to be used with the classic memory model

View File

@@ -0,0 +1,191 @@
# Copyright (c) 2016-2017, 2022-2023 Arm Limited
# All rights reserved.
#
# The license below extends only to copyright in the software and shall
# not be construed as granting a license to any other intellectual
# property including but not limited to intellectual property relating
# to a hardware implementation of the functionality of the software
# licensed hereunder. You may use the software subject to the license
# terms below provided that you ensure that this notice is replicated
# unmodified and in its entirety in all distributions of the software,
# modified or unmodified, in source code or in binary form.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met: redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer;
# redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution;
# neither the name of the copyright holders nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import os
import m5
from m5.util import addToPath
from m5.objects import *
import argparse
import shlex
m5.util.addToPath("../..")
from common import ObjectList
import devices
def get_processes(cmd):
    """Interprets commands to run and returns a list of processes"""
    working_dir = os.getcwd()
    group_id = os.getgid()

    processes = []
    for index, command in enumerate(cmd):
        # Split the command string the same way a shell would.
        argv = shlex.split(command)
        proc = Process(
            pid=100 + index, cwd=working_dir, cmd=argv, executable=argv[0]
        )
        proc.gid = group_id
        print("info: %d. command and arguments: %s" % (index + 1, proc.cmd))
        processes.append(proc)

    return processes
def create(args):
    """Create and configure the system object.

    Builds a syscall-emulation system with an elastic-trace-enabled O3
    CPU cluster, a deliberately minimal memory system (compulsory L1
    caches only, plus a 1 ns SimpleMemory) so captured compute delays
    exclude memory-access latency, and one workload process per core.

    :param args: parsed command-line namespace (uses num_cores,
        cpu_freq, inst_trace_file, data_trace_file, mem_size,
        commands_to_run)
    :returns: the configured system object
    :raises SystemExit: if the number of commands does not match the
        number of cores
    """
    system = devices.SimpleSeSystem(
        mem_mode="timing",
    )

    # Add CPUs to the system. A cluster of CPUs typically have
    # private L1 caches and a shared L2 cache.
    system.cpu_cluster = devices.ArmCpuCluster(
        system,
        args.num_cores,
        args.cpu_freq,
        "1.2V",
        ObjectList.cpu_list.get("O3_ARM_v7a_3_Etrace"),
        devices.L1I,
        devices.L1D,
        devices.L2,
    )

    # Attach the elastic trace probe listener to every CPU in the cluster
    for cpu in system.cpu_cluster:
        cpu.attach_probe_listener(args.inst_trace_file, args.data_trace_file)

    # As elastic trace generation is enabled, make sure the memory system is
    # minimal so that compute delays do not include memory access latencies.
    # Configure the compulsory L1 caches for the O3CPU, do not configure
    # any more caches.
    system.addCaches(True, last_cache_level=1)

    # For elastic trace, over-riding Simple Memory latency to 1ns.
    system.memory = SimpleMemory(
        range=AddrRange(start=0, size=args.mem_size),
        latency="1ns",
        port=system.membus.mem_side_ports,
    )

    # Parse the command line and get a list of Processes instances
    # that we can pass to gem5.
    processes = get_processes(args.commands_to_run)
    if len(processes) != args.num_cores:
        print(
            "Error: Cannot map %d command(s) onto %d CPU(s)"
            % (len(processes), args.num_cores)
        )
        # Fix: the original called sys.exit(1) but this file never
        # imports `sys`, so the error path raised NameError instead of
        # exiting. SystemExit(1) is exactly what sys.exit(1) raises and
        # requires no import.
        raise SystemExit(1)

    system.workload = SEWorkload.init_compatible(processes[0].executable)

    # Assign one workload to each CPU
    for cpu, workload in zip(system.cpu_cluster.cpus, processes):
        cpu.workload = workload

    return system
def main():
    """Parse arguments, build the system, and run the simulation."""
    arg_parser = argparse.ArgumentParser(epilog=__doc__)
    arg_parser.add_argument(
        "commands_to_run",
        metavar="command(s)",
        nargs="+",
        help="Command(s) to run",
    )
    arg_parser.add_argument(
        "--inst-trace-file",
        action="store",
        type=str,
        help="""Instruction fetch trace file input to
        Elastic Trace probe in a capture simulation and
        Trace CPU in a replay simulation""",
        default="fetchtrace.proto.gz",
    )
    arg_parser.add_argument(
        "--data-trace-file",
        action="store",
        type=str,
        help="""Data dependency trace file input to
        Elastic Trace probe in a capture simulation and
        Trace CPU in a replay simulation""",
        default="deptrace.proto.gz",
    )
    arg_parser.add_argument("--cpu-freq", type=str, default="4GHz")
    arg_parser.add_argument(
        "--num-cores", type=int, default=1, help="Number of CPU cores"
    )
    arg_parser.add_argument(
        "--mem-size",
        action="store",
        type=str,
        default="2GB",
        help="Specify the physical memory size",
    )
    options = arg_parser.parse_args()

    # Create a single root node for gem5's object hierarchy. There can
    # only exist one root node in the simulator at any given
    # time. Tell gem5 that we want to use syscall emulation mode
    # instead of full system mode.
    root_node = Root(full_system=False)

    # Populate the root node with a system. A system corresponds to a
    # single node with shared memory.
    root_node.system = create(options)

    # Instantiate the C++ object hierarchy. After this point,
    # SimObjects can't be instantiated anymore.
    m5.instantiate()

    # Start the simulator. This gives control to the C++ world and
    # starts the simulator. The returned event tells the simulation
    # script why the simulator exited.
    exit_event = m5.simulate()

    # Print the reason for the simulation exit. Some exit codes are
    # requests for service (e.g., checkpoints) from the simulation
    # script. We'll just ignore them here and exit.
    print(f"{exit_event.getCause()} ({exit_event.getCode()}) @ {m5.curTick()}")


if __name__ == "__m5_main__":
    main()

View File

@@ -64,72 +64,6 @@ cpu_types = {
}
class SimpleSeSystem(System):
"""
Example system class for syscall emulation mode
"""
# Use a fixed cache line size of 64 bytes
cache_line_size = 64
def __init__(self, args, **kwargs):
super(SimpleSeSystem, self).__init__(**kwargs)
# Setup book keeping to be able to use CpuClusters from the
# devices module.
self._clusters = []
self._num_cpus = 0
# Create a voltage and clock domain for system components
self.voltage_domain = VoltageDomain(voltage="3.3V")
self.clk_domain = SrcClockDomain(
clock="1GHz", voltage_domain=self.voltage_domain
)
# Create the off-chip memory bus.
self.membus = SystemXBar()
# Wire up the system port that gem5 uses to load the kernel
# and to perform debug accesses.
self.system_port = self.membus.cpu_side_ports
# Add CPUs to the system. A cluster of CPUs typically have
# private L1 caches and a shared L2 cache.
self.cpu_cluster = devices.ArmCpuCluster(
self,
args.num_cores,
args.cpu_freq,
"1.2V",
*cpu_types[args.cpu],
tarmac_gen=args.tarmac_gen,
tarmac_dest=args.tarmac_dest,
)
# Create a cache hierarchy (unless we are simulating a
# functional CPU in atomic memory mode) for the CPU cluster
# and connect it to the shared memory bus.
if self.cpu_cluster.memory_mode() == "timing":
self.cpu_cluster.addL1()
self.cpu_cluster.addL2(self.cpu_cluster.clk_domain)
self.cpu_cluster.connectMemSide(self.membus)
# Tell gem5 about the memory mode used by the CPUs we are
# simulating.
self.mem_mode = self.cpu_cluster.memory_mode()
def numCpuClusters(self):
return len(self._clusters)
def addCpuCluster(self, cpu_cluster):
assert cpu_cluster not in self._clusters
assert len(cpu_cluster) > 0
self._clusters.append(cpu_cluster)
self._num_cpus += len(cpu_cluster)
def numCpus(self):
return self._num_cpus
def get_processes(cmd):
"""Interprets commands to run and returns a list of processes"""
@@ -150,7 +84,31 @@ def get_processes(cmd):
def create(args):
"""Create and configure the system object."""
system = SimpleSeSystem(args)
cpu_class = cpu_types[args.cpu][0]
mem_mode = cpu_class.memory_mode()
# Only simulate caches when using a timing CPU (e.g., the HPI model)
want_caches = True if mem_mode == "timing" else False
system = devices.SimpleSeSystem(
mem_mode=mem_mode,
)
# Add CPUs to the system. A cluster of CPUs typically have
# private L1 caches and a shared L2 cache.
system.cpu_cluster = devices.ArmCpuCluster(
system,
args.num_cores,
args.cpu_freq,
"1.2V",
*cpu_types[args.cpu],
tarmac_gen=args.tarmac_gen,
tarmac_dest=args.tarmac_dest,
)
# Create a cache hierarchy for the cluster. We are assuming that
# clusters have core-private L1 caches and an L2 that's shared
# within the cluster.
system.addCaches(want_caches, last_cache_level=2)
# Tell components about the expected physical memory ranges. This
# is, for example, used by the MemConfig helper to determine where
@@ -160,6 +118,9 @@ def create(args):
# Configure the off-chip memory system.
MemConfig.config_mem(args, system)
# Wire up the system's memory system
system.connect()
# Parse the command line and get a list of Processes instances
# that we can pass to gem5.
processes = get_processes(args.commands_to_run)

View File

@@ -128,7 +128,6 @@ board.set_se_simpoint_workload(
)
dir = Path(args.checkpoint_path)
dir.mkdir(exist_ok=True)
simulator = Simulator(
board=board,

View File

@@ -39,6 +39,7 @@ SimObject('AMDGPU.py', sim_objects=['AMDGPUDevice', 'AMDGPUInterruptHandler',
tags='x86 isa')
Source('amdgpu_device.cc', tags='x86 isa')
Source('amdgpu_gfx.cc', tags='x86 isa')
Source('amdgpu_nbio.cc', tags='x86 isa')
Source('amdgpu_vm.cc', tags='x86 isa')
Source('interrupt_handler.cc', tags='x86 isa')

View File

@@ -379,6 +379,9 @@ AMDGPUDevice::readMMIO(PacketPtr pkt, Addr offset)
case GRBM_BASE:
gpuvm.readMMIO(pkt, aperture_offset >> GRBM_OFFSET_SHIFT);
break;
case GFX_BASE:
gfx.readMMIO(pkt, aperture_offset);
break;
case MMHUB_BASE:
gpuvm.readMMIO(pkt, aperture_offset >> MMHUB_OFFSET_SHIFT);
break;
@@ -507,6 +510,9 @@ AMDGPUDevice::writeMMIO(PacketPtr pkt, Addr offset)
case NBIO_BASE:
nbio.writeMMIO(pkt, aperture_offset);
break;
case GFX_BASE:
gfx.writeMMIO(pkt, aperture_offset);
break;
default:
DPRINTF(AMDGPUDevice, "Unknown MMIO aperture for %#x\n", offset);
break;

View File

@@ -36,6 +36,7 @@
#include "base/bitunion.hh"
#include "dev/amdgpu/amdgpu_defines.hh"
#include "dev/amdgpu/amdgpu_gfx.hh"
#include "dev/amdgpu/amdgpu_nbio.hh"
#include "dev/amdgpu/amdgpu_vm.hh"
#include "dev/amdgpu/memory_manager.hh"
@@ -109,6 +110,7 @@ class AMDGPUDevice : public PciDevice
* Blocks of the GPU
*/
AMDGPUNbio nbio;
AMDGPUGfx gfx;
AMDGPUMemoryManager *gpuMemMgr;
AMDGPUInterruptHandler *deviceIH;
AMDGPUVM gpuvm;

View File

@@ -0,0 +1,73 @@
/*
* Copyright (c) 2023 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "dev/amdgpu/amdgpu_gfx.hh"
#include "mem/packet_access.hh"
#include "sim/core.hh"
namespace gem5
{
/**
 * Handle an MMIO read within the GFX aperture.
 *
 * Only the two RLC GPU clock count registers are implemented: the LSB
 * register returns the low 32 bits of the captured clock count, the MSB
 * register the high 32 bits. Reads of any other offset set no data.
 */
void
AMDGPUGfx::readMMIO(PacketPtr pkt, Addr offset)
{
    if (offset == AMDGPU_MM_RLC_GPU_CLOCK_COUNT_LSB) {
        // Low half of the 64-bit captured count.
        pkt->setLE<uint32_t>(captured_clock_count);
    } else if (offset == AMDGPU_MM_RLC_GPU_CLOCK_COUNT_MSB) {
        // High half of the 64-bit captured count.
        pkt->setLE<uint32_t>(captured_clock_count >> 32);
    }
    // Any other offset is silently ignored.
}
/**
 * Handle an MMIO write within the GFX aperture.
 *
 * Only the clock-count capture register is implemented: writing it
 * latches the current simulated time into captured_clock_count, which
 * is later read back via the LSB/MSB read registers. Writes to any
 * other offset are ignored.
 */
void
AMDGPUGfx::writeMMIO(PacketPtr pkt, Addr offset)
{
    switch (offset) {
      case AMDGPU_MM_RLC_CAPTURE_GPU_CLOCK_COUNT:
        // Use gem5 Ticks in nanoseconds as the counter. The first capture
        // is expected to return zero, so the count starts at the sentinel
        // value 1 and is reset to 0 on the first capture request.
        if (captured_clock_count == 1) {
            captured_clock_count = 0;
        } else {
            captured_clock_count = curTick() / sim_clock::as_int::ns;
        }
        break;
      default:
        break;
    }
}
} // namespace gem5

View File

@@ -0,0 +1,75 @@
/*
* Copyright (c) 2023 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __DEV_AMDGPU_AMDGPU_GFX_HH__
#define __DEV_AMDGPU_AMDGPU_GFX_HH__
#include "base/types.hh"
#include "mem/packet.hh"
/**
* MMIO offsets for GFX. This class handles MMIO reads/writes to the GFX_BASE
* aperture which are generally read/written by the gfx driver source here:
*
* drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
* https://github.com/RadeonOpenCompute/ROCK-Kernel-Driver/blob/master/
* drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
*
* The MMIO addresses in the file are dword addresses. Here they are converted
* to byte addresses so gem5 does not need to shift the values.
*/
// Registers used to read GPU clock count used in profiling
#define AMDGPU_MM_RLC_GPU_CLOCK_COUNT_LSB 0x13090
#define AMDGPU_MM_RLC_GPU_CLOCK_COUNT_MSB 0x13094
#define AMDGPU_MM_RLC_CAPTURE_GPU_CLOCK_COUNT 0x13098
namespace gem5
{
/**
 * Model of the GFX portion of the AMDGPU MMIO space. Decodes reads and
 * writes to the RLC GPU clock count registers defined above.
 */
class AMDGPUGfx
{
  public:
    AMDGPUGfx() { }

    /** Handle an MMIO read from a byte offset in the GFX aperture. */
    void readMMIO(PacketPtr pkt, Addr offset);
    /** Handle an MMIO write to a byte offset in the GFX aperture. */
    void writeMMIO(PacketPtr pkt, Addr offset);

  private:
    /*
     * GPU clock count (gem5 ticks converted to nanoseconds) at the time
     * the capture MMIO is received. Initialized to 1 so the first capture
     * request can be detected and reported as zero.
     */
    uint64_t captured_clock_count = 1;
};
} // namespace gem5
#endif // __DEV_AMDGPU_AMDGPU_GFX_HH__

View File

@@ -69,6 +69,12 @@ typedef struct amd_signal_s
uint32_t reserved3[2];
} amd_signal_t;
/*
 * Task dispatch timestamps written back alongside an HSA signal. The
 * command processor fills these with curTick() / sim_clock::as_int::ns,
 * so values are presumably in nanoseconds — confirm against the driver's
 * expectations.
 */
typedef struct
{
    uint64_t start_ts;  // time the dispatch packet was submitted
    uint64_t end_ts;    // time the completion signal was processed
} amd_event_t;
} // namespace gem5
#endif // DEV_HSA_HSA_SIGNAL_H

View File

@@ -248,6 +248,10 @@ GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
initABI(task);
++dynamic_task_id;
// The driver expects the start time to be in ns
Tick start_ts = curTick() / sim_clock::as_int::ns;
dispatchStartTime.insert({disp_pkt->completion_signal, start_ts});
}
void
@@ -280,16 +284,6 @@ GPUCommandProcessor::sendCompletionSignal(Addr signal_handle)
void
GPUCommandProcessor::updateHsaSignalAsync(Addr signal_handle, int64_t diff)
{
Addr value_addr = getHsaSignalValueAddr(signal_handle);
uint64_t *signalValue = new uint64_t;
auto cb = new DmaVirtCallback<uint64_t>(
[ = ] (const uint64_t &)
{ updateHsaSignalData(value_addr, diff, signalValue); });
dmaReadVirt(value_addr, sizeof(uint64_t), cb, (void *)signalValue);
DPRINTF(GPUCommandProc, "updateHsaSignalAsync reading value addr %lx\n",
value_addr);
Addr mailbox_addr = getHsaSignalMailboxAddr(signal_handle);
uint64_t *mailboxValue = new uint64_t;
auto cb2 = new DmaVirtCallback<uint64_t>(
@@ -300,20 +294,6 @@ GPUCommandProcessor::updateHsaSignalAsync(Addr signal_handle, int64_t diff)
mailbox_addr);
}
/**
 * Apply a delta to a just-read HSA signal value and DMA the new value
 * back to the signal's value field.
 *
 * @param value_addr Virtual address of the signal's value field.
 * @param diff Amount to add to the current signal value.
 * @param prev_value Heap buffer holding the value read by the caller's
 *                   DMA; reused as the write buffer and handed to
 *                   updateHsaSignalDone once the write completes.
 */
void
GPUCommandProcessor::updateHsaSignalData(Addr value_addr, int64_t diff,
                                         uint64_t *prev_value)
{
    // Reuse the value allocated for the read
    DPRINTF(GPUCommandProc, "updateHsaSignalData read %ld, writing %ld\n",
            *prev_value, *prev_value + diff);
    *prev_value += diff;
    auto cb = new DmaVirtCallback<uint64_t>(
        [ = ] (const uint64_t &)
        { updateHsaSignalDone(prev_value); });
    dmaWriteVirt(value_addr, sizeof(uint64_t), cb, (void *)prev_value);
}
void
GPUCommandProcessor::updateHsaMailboxData(Addr signal_handle,
uint64_t *mailbox_value)
@@ -331,6 +311,20 @@ GPUCommandProcessor::updateHsaMailboxData(Addr signal_handle,
dmaReadVirt(event_addr, sizeof(uint64_t), cb, (void *)mailbox_value);
} else {
delete mailbox_value;
Addr ts_addr = signal_handle + offsetof(amd_signal_t, start_ts);
amd_event_t *event_ts = new amd_event_t;
event_ts->start_ts = dispatchStartTime[signal_handle];
event_ts->end_ts = curTick() / sim_clock::as_int::ns;
auto cb = new DmaVirtCallback<uint64_t>(
[ = ] (const uint64_t &)
{ updateHsaEventTs(signal_handle, event_ts); });
dmaWriteVirt(ts_addr, sizeof(amd_event_t), cb, (void *)event_ts);
DPRINTF(GPUCommandProc, "updateHsaMailboxData reading timestamp addr "
"%lx\n", ts_addr);
dispatchStartTime.erase(signal_handle);
}
}
@@ -346,6 +340,52 @@ GPUCommandProcessor::updateHsaEventData(Addr signal_handle,
[ = ] (const uint64_t &)
{ updateHsaSignalDone(event_value); }, *event_value);
dmaWriteVirt(mailbox_addr, sizeof(uint64_t), cb, &cb->dmaBuffer, 0);
Addr ts_addr = signal_handle + offsetof(amd_signal_t, start_ts);
amd_event_t *event_ts = new amd_event_t;
event_ts->start_ts = dispatchStartTime[signal_handle];
event_ts->end_ts = curTick() / sim_clock::as_int::ns;
auto cb2 = new DmaVirtCallback<uint64_t>(
[ = ] (const uint64_t &)
{ updateHsaEventTs(signal_handle, event_ts); });
dmaWriteVirt(ts_addr, sizeof(amd_event_t), cb2, (void *)event_ts);
DPRINTF(GPUCommandProc, "updateHsaEventData reading timestamp addr %lx\n",
ts_addr);
dispatchStartTime.erase(signal_handle);
}
/**
 * Continuation after the dispatch timestamps have been written back:
 * frees the timestamp buffer, then kicks off the final signal-value
 * decrement by reading the signal's current value and chaining into
 * updateHsaSignalData with a delta of -1.
 *
 * @param signal_handle Handle of the completion signal being updated.
 * @param ts Heap-allocated timestamp buffer from the preceding DMA
 *           write; no longer needed, freed here.
 */
void
GPUCommandProcessor::updateHsaEventTs(Addr signal_handle,
                                      amd_event_t *ts)
{
    // The timestamp DMA has completed; release its buffer.
    delete ts;

    Addr value_addr = getHsaSignalValueAddr(signal_handle);
    int64_t diff = -1;

    uint64_t *signalValue = new uint64_t;
    auto cb = new DmaVirtCallback<uint64_t>(
        [ = ] (const uint64_t &)
        { updateHsaSignalData(value_addr, diff, signalValue); });
    dmaReadVirt(value_addr, sizeof(uint64_t), cb, (void *)signalValue);
    // Bug fix: the debug message previously named updateHsaSignalAsync
    // (copy-paste from that method); report the actual caller.
    DPRINTF(GPUCommandProc, "updateHsaEventTs reading value addr %lx\n",
            value_addr);
}
/**
 * Apply a delta to a just-read HSA signal value and DMA the new value
 * back to the signal's value field.
 *
 * @param value_addr Virtual address of the signal's value field.
 * @param diff Amount to add to the current signal value.
 * @param prev_value Heap buffer holding the value read by the caller's
 *                   DMA; reused as the write buffer and handed to
 *                   updateHsaSignalDone once the write completes.
 */
void
GPUCommandProcessor::updateHsaSignalData(Addr value_addr, int64_t diff,
                                         uint64_t *prev_value)
{
    // Reuse the value allocated for the read
    DPRINTF(GPUCommandProc, "updateHsaSignalData read %ld, writing %ld\n",
            *prev_value, *prev_value + diff);
    *prev_value += diff;
    auto cb = new DmaVirtCallback<uint64_t>(
        [ = ] (const uint64_t &)
        { updateHsaSignalDone(prev_value); });
    dmaWriteVirt(value_addr, sizeof(uint64_t), cb, (void *)prev_value);
}
void

View File

@@ -117,6 +117,7 @@ class GPUCommandProcessor : public DmaVirtDevice
void updateHsaSignalDone(uint64_t *signal_value);
void updateHsaMailboxData(Addr signal_handle, uint64_t *mailbox_value);
void updateHsaEventData(Addr signal_handle, uint64_t *event_value);
void updateHsaEventTs(Addr signal_handle, amd_event_t *event_value);
uint64_t functionalReadHsaSignal(Addr signal_handle);
@@ -148,6 +149,9 @@ class GPUCommandProcessor : public DmaVirtDevice
HSAPacketProcessor *hsaPP;
TranslationGenPtr translate(Addr vaddr, Addr size) override;
// Keep track of start times for task dispatches.
std::unordered_map<Addr, Tick> dispatchStartTime;
/**
* Perform a DMA read of the read_dispatch_id_field_base_byte_offset
* field, which follows directly after the read_dispatch_id (the read

View File

@@ -1,3 +1,4 @@
---
version: '2'
services: