From 6a4b2bb0965a28d1428a60b867318e23511dd168 Mon Sep 17 00:00:00 2001
From: Matthew Poremba <matthew.poremba@amd.com>
Date: Mon, 11 Sep 2023 09:22:26 -0500
Subject: [PATCH 1/2] dev-hsa,gpu-compute: Add timestamps to AMD HSA signals

The AMD specific HSA signal contains start/end timestamps for dispatch
packet completion signals. These are currently always zero. These
timestamp values are used for profiling in the ROCr runtime.
Unfortunately, the GpuAgent::TranslateTime method in ROCr does not check
for zero values before dividing, causing applications that use profiling
to crash with SIGFPE. Profiling is used via hipEvents in the HACC
application, so these should be supported in gem5.

In order to handle writing the timestamp values, we need to DMA the
values to memory before writing the completion signal. This changes the
flow of the async completion signal write to be (1) read mailbox pointer
(2) if valid, read the event data, otherwise skip to 4 (3) write mailbox
data (4) write timestamp values (5) write completion signal. The
application will process the timestamp data as soon as the completion
signal is received, so we need this ordering to ensure the DMA for
timestamps has completed.

HACC now runs to completion on GPUFS and has the same output as
hardware.

Change-Id: I09877cdff901d1402140f2c3bafea7605fa6554e
---
 src/dev/hsa/hsa_signal.hh                |  6 ++
 src/gpu-compute/gpu_command_processor.cc | 88 +++++++++++++++++-------
 src/gpu-compute/gpu_command_processor.hh |  4 ++
 3 files changed, 74 insertions(+), 24 deletions(-)

diff --git a/src/dev/hsa/hsa_signal.hh b/src/dev/hsa/hsa_signal.hh
index 6acbcb7e1b..7d1f316f04 100644
--- a/src/dev/hsa/hsa_signal.hh
+++ b/src/dev/hsa/hsa_signal.hh
@@ -69,6 +69,12 @@ typedef struct amd_signal_s
   uint32_t reserved3[2];
 } amd_signal_t;
 
+typedef struct  // timestamp pair DMA'd to offsetof(amd_signal_t, start_ts)
+{
+  uint64_t start_ts;  // dispatch submission time (curTick() in ns)
+  uint64_t end_ts;    // time the completion path ran (curTick() in ns)
+} amd_event_t;
+
 } // namespace gem5
 
 #endif // DEV_HSA_HSA_SIGNAL_H
diff --git a/src/gpu-compute/gpu_command_processor.cc b/src/gpu-compute/gpu_command_processor.cc
index db69ad5cbd..ecc5f1d98b 100644
--- a/src/gpu-compute/gpu_command_processor.cc
+++ b/src/gpu-compute/gpu_command_processor.cc
@@ -248,6 +248,10 @@ GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
 
     initABI(task);
     ++dynamic_task_id;
+
+    // The driver expects the start time to be in ns
+    Tick start_ts = curTick() / sim_clock::as_int::ns;
+    dispatchStartTime.insert({disp_pkt->completion_signal, start_ts});
 }
 
 void
@@ -280,16 +284,6 @@ GPUCommandProcessor::sendCompletionSignal(Addr signal_handle)
 void
 GPUCommandProcessor::updateHsaSignalAsync(Addr signal_handle, int64_t diff)
 {
-    Addr value_addr = getHsaSignalValueAddr(signal_handle);
-
-    uint64_t *signalValue = new uint64_t;
-    auto cb = new DmaVirtCallback<uint64_t>(
-        [ = ] (const uint64_t &)
-            { updateHsaSignalData(value_addr, diff, signalValue); });
-    dmaReadVirt(value_addr, sizeof(uint64_t), cb, (void *)signalValue);
-    DPRINTF(GPUCommandProc, "updateHsaSignalAsync reading value addr %lx\n",
-            value_addr);
-
     Addr mailbox_addr = getHsaSignalMailboxAddr(signal_handle);
     uint64_t *mailboxValue = new uint64_t;
     auto cb2 = new DmaVirtCallback<uint64_t>(
@@ -300,20 +294,6 @@ GPUCommandProcessor::updateHsaSignalAsync(Addr signal_handle, int64_t diff)
             mailbox_addr);
 }
 
-void
-GPUCommandProcessor::updateHsaSignalData(Addr value_addr, int64_t diff,
-                                         uint64_t *prev_value)
-{
-    // Reuse the value allocated for the read
-    DPRINTF(GPUCommandProc, "updateHsaSignalData read %ld, writing %ld\n",
-            *prev_value, *prev_value + diff);
-    *prev_value += diff;
-    auto cb = new DmaVirtCallback<uint64_t>(
-        [ = ] (const uint64_t &)
-            { updateHsaSignalDone(prev_value); });
-    dmaWriteVirt(value_addr, sizeof(uint64_t), cb, (void *)prev_value);
-}
-
 void
 GPUCommandProcessor::updateHsaMailboxData(Addr signal_handle,
                                           uint64_t *mailbox_value)
@@ -331,6 +311,20 @@ GPUCommandProcessor::updateHsaMailboxData(Addr signal_handle,
         dmaReadVirt(event_addr, sizeof(uint64_t), cb, (void *)mailbox_value);
     } else {
         delete mailbox_value;
+
+        Addr ts_addr = signal_handle + offsetof(amd_signal_t, start_ts);
+
+        amd_event_t *event_ts = new amd_event_t;
+        event_ts->start_ts = dispatchStartTime[signal_handle];
+        event_ts->end_ts = curTick() / sim_clock::as_int::ns;
+        auto cb = new DmaVirtCallback<uint64_t>(
+            [ = ] (const uint64_t &)
+                { updateHsaEventTs(signal_handle, event_ts); });
+        dmaWriteVirt(ts_addr, sizeof(amd_event_t), cb, (void *)event_ts);
+        DPRINTF(GPUCommandProc, "updateHsaMailboxData reading timestamp addr "
+                "%lx\n", ts_addr);
+
+        dispatchStartTime.erase(signal_handle);
     }
 }
 
@@ -346,6 +340,52 @@ GPUCommandProcessor::updateHsaEventData(Addr signal_handle,
         [ = ] (const uint64_t &)
             { updateHsaSignalDone(event_value); }, *event_value);
     dmaWriteVirt(mailbox_addr, sizeof(uint64_t), cb, &cb->dmaBuffer, 0);
+
+    Addr ts_addr = signal_handle + offsetof(amd_signal_t, start_ts);
+
+    amd_event_t *event_ts = new amd_event_t;
+    event_ts->start_ts = dispatchStartTime[signal_handle];
+    event_ts->end_ts = curTick() / sim_clock::as_int::ns;
+    auto cb2 = new DmaVirtCallback<uint64_t>(
+        [ = ] (const uint64_t &)
+            { updateHsaEventTs(signal_handle, event_ts); });
+    dmaWriteVirt(ts_addr, sizeof(amd_event_t), cb2, (void *)event_ts);
+    DPRINTF(GPUCommandProc, "updateHsaEventData reading timestamp addr %lx\n",
+            ts_addr);
+
+    dispatchStartTime.erase(signal_handle);
+}
+
+void
+GPUCommandProcessor::updateHsaEventTs(Addr signal_handle,
+                                      amd_event_t *ts)
+{   // Callback of the timestamp DMA write; starts the signal value update.
+    delete ts;  // buffer the caller allocated as the DMA write source
+
+    Addr value_addr = getHsaSignalValueAddr(signal_handle);
+    int64_t diff = -1;  // added to the signal value in updateHsaSignalData
+
+    uint64_t *signalValue = new uint64_t;  // read buffer, reused for write
+    auto cb = new DmaVirtCallback<uint64_t>(
+        [ = ] (const uint64_t &)
+            { updateHsaSignalData(value_addr, diff, signalValue); });
+    dmaReadVirt(value_addr, sizeof(uint64_t), cb, (void *)signalValue);
+    DPRINTF(GPUCommandProc, "updateHsaSignalAsync reading value addr %lx\n",
+            value_addr);  // NOTE(review): text names updateHsaSignalAsync
+}
+
+void
+GPUCommandProcessor::updateHsaSignalData(Addr value_addr, int64_t diff,
+                                         uint64_t *prev_value)
+{
+    // Reuse the buffer allocated for the read: add diff, then write it back
+    DPRINTF(GPUCommandProc, "updateHsaSignalData read %ld, writing %ld\n",
+            *prev_value, *prev_value + diff);
+    *prev_value += diff;  // updated in place; same buffer is the DMA source
+    auto cb = new DmaVirtCallback<uint64_t>(
+        [ = ] (const uint64_t &)
+            { updateHsaSignalDone(prev_value); });
+    dmaWriteVirt(value_addr, sizeof(uint64_t), cb, (void *)prev_value);
 }
 
 void
diff --git a/src/gpu-compute/gpu_command_processor.hh b/src/gpu-compute/gpu_command_processor.hh
index 10407b9f93..f6783834eb 100644
--- a/src/gpu-compute/gpu_command_processor.hh
+++ b/src/gpu-compute/gpu_command_processor.hh
@@ -117,6 +117,7 @@ class GPUCommandProcessor : public DmaVirtDevice
     void updateHsaSignalDone(uint64_t *signal_value);
     void updateHsaMailboxData(Addr signal_handle, uint64_t *mailbox_value);
     void updateHsaEventData(Addr signal_handle, uint64_t *event_value);
+    void updateHsaEventTs(Addr signal_handle, amd_event_t *event_value);
 
     uint64_t functionalReadHsaSignal(Addr signal_handle);
 
@@ -148,6 +149,9 @@ class GPUCommandProcessor : public DmaVirtDevice
     HSAPacketProcessor *hsaPP;
     TranslationGenPtr translate(Addr vaddr, Addr size) override;
 
+    // Keep track of start times for task dispatches.
+    std::unordered_map<Addr, Tick> dispatchStartTime;
+
     /**
      * Perform a DMA read of the read_dispatch_id_field_base_byte_offset
      * field, which follows directly after the read_dispatch_id (the read

From 75a7f30dfb9ada69e8fdb35c54f0c3dd7d94164d Mon Sep 17 00:00:00 2001
From: Matthew Poremba <matthew.poremba@amd.com>
Date: Fri, 6 Oct 2023 13:02:31 -0500
Subject: [PATCH 2/2] dev-amdgpu: Implement GPU clock MMIOs

The ROCr runtime uses a combination of HSA signal timestamps and
hardware MMIOs to calculate profiling times. At the beginning of an
application a timestamp is read from the GPU using MMIOs. The clock
MMIOs reside in the GFX MMIO region, so a new AMDGPUGfx class is added
to handle these MMIOs.

The timestamp value is expected to be in nanoseconds, so we simply use
the gem5 tick converted to ns.

Change-Id: I7d1cba40d5042a7f7a81fd4d132402dc11b71bd4
---
 src/dev/amdgpu/SConscript       |  1 +
 src/dev/amdgpu/amdgpu_device.cc |  6 +++
 src/dev/amdgpu/amdgpu_device.hh |  2 +
 src/dev/amdgpu/amdgpu_gfx.cc    | 73 ++++++++++++++++++++++++++++++++
 src/dev/amdgpu/amdgpu_gfx.hh    | 75 +++++++++++++++++++++++++++++++++
 5 files changed, 157 insertions(+)
 create mode 100644 src/dev/amdgpu/amdgpu_gfx.cc
 create mode 100644 src/dev/amdgpu/amdgpu_gfx.hh

diff --git a/src/dev/amdgpu/SConscript b/src/dev/amdgpu/SConscript
index 428d1c56bc..b8ba454d48 100644
--- a/src/dev/amdgpu/SConscript
+++ b/src/dev/amdgpu/SConscript
@@ -39,6 +39,7 @@ SimObject('AMDGPU.py', sim_objects=['AMDGPUDevice', 'AMDGPUInterruptHandler',
                                     tags='x86 isa')
 
 Source('amdgpu_device.cc', tags='x86 isa')
+Source('amdgpu_gfx.cc', tags='x86 isa')
 Source('amdgpu_nbio.cc', tags='x86 isa')
 Source('amdgpu_vm.cc', tags='x86 isa')
 Source('interrupt_handler.cc', tags='x86 isa')
diff --git a/src/dev/amdgpu/amdgpu_device.cc b/src/dev/amdgpu/amdgpu_device.cc
index 5cc8df424f..1b81c4d0b2 100644
--- a/src/dev/amdgpu/amdgpu_device.cc
+++ b/src/dev/amdgpu/amdgpu_device.cc
@@ -379,6 +379,9 @@ AMDGPUDevice::readMMIO(PacketPtr pkt, Addr offset)
       case GRBM_BASE:
         gpuvm.readMMIO(pkt, aperture_offset >> GRBM_OFFSET_SHIFT);
         break;
+      case GFX_BASE:
+        gfx.readMMIO(pkt, aperture_offset);
+        break;
       case MMHUB_BASE:
         gpuvm.readMMIO(pkt, aperture_offset >> MMHUB_OFFSET_SHIFT);
         break;
@@ -507,6 +510,9 @@ AMDGPUDevice::writeMMIO(PacketPtr pkt, Addr offset)
       case NBIO_BASE:
         nbio.writeMMIO(pkt, aperture_offset);
         break;
+      case GFX_BASE:
+        gfx.writeMMIO(pkt, aperture_offset);
+        break;
       default:
         DPRINTF(AMDGPUDevice, "Unknown MMIO aperture for %#x\n", offset);
         break;
diff --git a/src/dev/amdgpu/amdgpu_device.hh b/src/dev/amdgpu/amdgpu_device.hh
index 56ed2f4fa8..7f69ec19f6 100644
--- a/src/dev/amdgpu/amdgpu_device.hh
+++ b/src/dev/amdgpu/amdgpu_device.hh
@@ -36,6 +36,7 @@
 
 #include "base/bitunion.hh"
 #include "dev/amdgpu/amdgpu_defines.hh"
+#include "dev/amdgpu/amdgpu_gfx.hh"
 #include "dev/amdgpu/amdgpu_nbio.hh"
 #include "dev/amdgpu/amdgpu_vm.hh"
 #include "dev/amdgpu/memory_manager.hh"
@@ -109,6 +110,7 @@ class AMDGPUDevice : public PciDevice
      * Blocks of the GPU
      */
     AMDGPUNbio nbio;
+    AMDGPUGfx gfx;
     AMDGPUMemoryManager *gpuMemMgr;
     AMDGPUInterruptHandler *deviceIH;
     AMDGPUVM gpuvm;
diff --git a/src/dev/amdgpu/amdgpu_gfx.cc b/src/dev/amdgpu/amdgpu_gfx.cc
new file mode 100644
index 0000000000..3d5b274b86
--- /dev/null
+++ b/src/dev/amdgpu/amdgpu_gfx.cc
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2023 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "dev/amdgpu/amdgpu_gfx.hh"
+
+#include "mem/packet_access.hh"
+#include "sim/core.hh"
+
+namespace gem5
+{
+
+void
+AMDGPUGfx::readMMIO(PacketPtr pkt, Addr offset)  // GFX register reads
+{
+    switch (offset) {
+      case AMDGPU_MM_RLC_GPU_CLOCK_COUNT_LSB:
+        pkt->setLE<uint32_t>(captured_clock_count);  // low 32 bits
+        break;
+      case AMDGPU_MM_RLC_GPU_CLOCK_COUNT_MSB:
+        pkt->setLE<uint32_t>(captured_clock_count >> 32);  // high 32 bits
+        break;
+      default:
+        break;  // any other offset leaves pkt unmodified
+    }
+}
+
+void
+AMDGPUGfx::writeMMIO(PacketPtr pkt, Addr offset)  // GFX register writes
+{
+    switch (offset) {
+      case AMDGPU_MM_RLC_CAPTURE_GPU_CLOCK_COUNT:
+        // Use gem5 Ticks converted to nanoseconds as the counter. The first
+        // capture is expected to return zero.
+        if (captured_clock_count == 1) {  // 1 is the member's initial value
+          captured_clock_count = 0;
+        } else {
+          captured_clock_count = curTick() / sim_clock::as_int::ns;
+        }
+        break;
+      default:
+        break;  // writes to any other offset are ignored
+    }
+}
+
+} // namespace gem5
diff --git a/src/dev/amdgpu/amdgpu_gfx.hh b/src/dev/amdgpu/amdgpu_gfx.hh
new file mode 100644
index 0000000000..c32b8624cf
--- /dev/null
+++ b/src/dev/amdgpu/amdgpu_gfx.hh
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2023 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __DEV_AMDGPU_AMDGPU_GFX_HH__
+#define __DEV_AMDGPU_AMDGPU_GFX_HH__
+
+#include "base/types.hh"
+#include "mem/packet.hh"
+
+/**
+ * MMIO offsets for GFX. This class handles MMIO reads/writes to the GFX_BASE
+ * aperture which are generally read/written by the gfx driver source here:
+ *
+ *      drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
+ * https://github.com/RadeonOpenCompute/ROCK-Kernel-Driver/blob/master/
+ *      drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+ *
+ * The MMIO addresses in the file are dword addresses. Here they are converted
+ * to byte addresses so gem5 does not need to shift the values.
+ */
+
+// Byte offsets of the registers that capture/read the GPU clock count
+#define AMDGPU_MM_RLC_GPU_CLOCK_COUNT_LSB                 0x13090
+#define AMDGPU_MM_RLC_GPU_CLOCK_COUNT_MSB                 0x13094
+#define AMDGPU_MM_RLC_CAPTURE_GPU_CLOCK_COUNT             0x13098
+
+namespace gem5
+{
+
+class AMDGPUGfx  // handles MMIO accesses routed to the GFX_BASE aperture
+{
+  public:
+    AMDGPUGfx() { }
+
+    void readMMIO(PacketPtr pkt, Addr offset);   // clock count LSB/MSB
+    void writeMMIO(PacketPtr pkt, Addr offset);  // clock count capture
+
+  private:
+    /* GPU clock count (in ns) at the time capture MMIO is received. The
+     * initial value of 1 makes writeMMIO report zero for the first capture.
+     */
+    uint64_t captured_clock_count = 1;
+};
+
+} // namespace gem5
+
+#endif // __DEV_AMDGPU_AMDGPU_GFX_HH__