From 8be5ce6fc96847a2910bfff8c966586dc8a4dddb Mon Sep 17 00:00:00 2001
From: Matthew Poremba <matthew.poremba@amd.com>
Date: Wed, 15 May 2024 10:49:05 -0700
Subject: [PATCH] dev-amdgpu,configs,gpu-compute: Add gfx942 version

This is the version for MI300. For the most part it is the same as
MI200; the one exception is architected flat scratch (not yet
implemented in gem5), which is why a new version enum is required.

Change-Id: Id18cd7b57c4eebd467c010a3f61e3117beb8d58a
---
 configs/example/gpufs/runfs.py         |  6 +++---
 configs/example/gpufs/system/amdgpu.py |  6 +++++-
 configs/example/gpufs/system/system.py |  2 +-
 src/dev/amdgpu/amdgpu_device.cc        |  9 ++++++++-
 src/dev/amdgpu/pm4_defines.hh          |  4 ++--
 src/dev/amdgpu/pm4_packet_processor.cc | 17 +++++++++--------
 src/dev/amdgpu/pm4_packet_processor.hh |  4 ++--
 src/gpu-compute/GPU.py                 |  2 +-
 src/gpu-compute/hsa_queue_entry.hh     | 10 ++++++----
 src/gpu-compute/wavefront.cc           |  3 ++-
 10 files changed, 39 insertions(+), 24 deletions(-)

diff --git a/configs/example/gpufs/runfs.py b/configs/example/gpufs/runfs.py
index fed155bc44..866fa89822 100644
--- a/configs/example/gpufs/runfs.py
+++ b/configs/example/gpufs/runfs.py
@@ -134,9 +134,9 @@ def addRunFSOptions(parser):
     parser.add_argument(
         "--gpu-device",
         default="Vega10",
-        choices=["Vega10", "MI100", "MI200"],
-        help="GPU model to run: Vega10 (gfx900), MI100 (gfx908), or "
-        "MI200 (gfx90a)",
+        choices=["Vega10", "MI100", "MI200", "MI300X"],
+        help="GPU model to run: Vega10 (gfx900), MI100 (gfx908), MI200 "
+        "(gfx90a), or MI300X (gfx942).",
     )
 
     parser.add_argument(
diff --git a/configs/example/gpufs/system/amdgpu.py b/configs/example/gpufs/system/amdgpu.py
index 0813759e2a..55937cd255 100644
--- a/configs/example/gpufs/system/amdgpu.py
+++ b/configs/example/gpufs/system/amdgpu.py
@@ -191,10 +191,14 @@ def connectGPU(system, args):
         system.pc.south_bridge.gpu.DeviceID = 0x740F
         system.pc.south_bridge.gpu.SubsystemVendorID = 0x1002
         system.pc.south_bridge.gpu.SubsystemID = 0x0C34
+    elif args.gpu_device == "MI300X":
+        system.pc.south_bridge.gpu.DeviceID = 0x740F
+        system.pc.south_bridge.gpu.SubsystemVendorID = 0x1002
+        system.pc.south_bridge.gpu.SubsystemID = 0x0C34
     elif args.gpu_device == "Vega10":
         system.pc.south_bridge.gpu.DeviceID = 0x6863
     else:
-        panic(f"Unknown GPU device: {args.gpu_device}")
+        m5.util.panic(f"Unknown GPU device: {args.gpu_device}")
 
     # Use the gem5 default of 0x280 OR'd  with 0x10 which tells Linux there is
     # a PCI capabilities list to travse.
diff --git a/configs/example/gpufs/system/system.py b/configs/example/gpufs/system/system.py
index 7c596f0ccf..1322650964 100644
--- a/configs/example/gpufs/system/system.py
+++ b/configs/example/gpufs/system/system.py
@@ -161,7 +161,7 @@ def makeGpuFSSystem(args):
             0x7D000,
         ]
         sdma_sizes = [0x1000] * 8
-    elif args.gpu_device == "MI200":
+    elif args.gpu_device == "MI200" or args.gpu_device == "MI300X":
         num_sdmas = 5
         sdma_bases = [
             0x4980,
diff --git a/src/dev/amdgpu/amdgpu_device.cc b/src/dev/amdgpu/amdgpu_device.cc
index 6bb5f9c2c5..b3a91830fe 100644
--- a/src/dev/amdgpu/amdgpu_device.cc
+++ b/src/dev/amdgpu/amdgpu_device.cc
@@ -86,6 +86,8 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p)
         gfx_version = GfxVersion::gfx908;
     } else if (p.device_name == "MI200") {
         gfx_version = GfxVersion::gfx90a;
+    } else if (p.device_name == "MI300X") {
+        gfx_version = GfxVersion::gfx942;
     } else {
         panic("Unknown GPU device %s\n", p.device_name);
     }
@@ -124,7 +126,8 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p)
         sdmaFunc.insert({0x10b, &SDMAEngine::setPageDoorbellOffsetLo});
         sdmaFunc.insert({0xe0, &SDMAEngine::setPageSize});
         sdmaFunc.insert({0x113, &SDMAEngine::setPageWptrLo});
-    } else if (p.device_name == "MI100" || p.device_name == "MI200") {
+    } else if (p.device_name == "MI100" || p.device_name == "MI200"
+            || p.device_name == "MI300X") {
         sdmaFunc.insert({0xd9, &SDMAEngine::setPageBaseLo});
         sdmaFunc.insert({0xe1, &SDMAEngine::setPageRptrLo});
         sdmaFunc.insert({0xe0, &SDMAEngine::setPageRptrHi});
@@ -195,6 +198,10 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p)
         setRegVal(MI200_FB_LOCATION_BASE, mmhubBase >> 24);
         setRegVal(MI200_FB_LOCATION_TOP, mmhubTop >> 24);
         setRegVal(MI200_MEM_SIZE_REG, mem_size);
+    } else if (p.device_name == "MI300X") {
+        setRegVal(MI200_FB_LOCATION_BASE, mmhubBase >> 24);
+        setRegVal(MI200_FB_LOCATION_TOP, mmhubTop >> 24);
+        setRegVal(MI200_MEM_SIZE_REG, mem_size);
     } else {
         panic("Unknown GPU device %s\n", p.device_name);
     }
diff --git a/src/dev/amdgpu/pm4_defines.hh b/src/dev/amdgpu/pm4_defines.hh
index a303f8ef84..d00dc3730d 100644
--- a/src/dev/amdgpu/pm4_defines.hh
+++ b/src/dev/amdgpu/pm4_defines.hh
@@ -328,8 +328,8 @@ typedef struct GEM5_PACKED
         };
         uint64_t completionSignal;
     };
-}  PM4MapProcessMI200;
-static_assert(sizeof(PM4MapProcessMI200) == 80);
+}  PM4MapProcessV2;
+static_assert(sizeof(PM4MapProcessV2) == 80);
 
 typedef struct GEM5_PACKED
 {
diff --git a/src/dev/amdgpu/pm4_packet_processor.cc b/src/dev/amdgpu/pm4_packet_processor.cc
index a921942678..9a8ba13914 100644
--- a/src/dev/amdgpu/pm4_packet_processor.cc
+++ b/src/dev/amdgpu/pm4_packet_processor.cc
@@ -290,18 +290,19 @@ PM4PacketProcessor::decodeHeader(PM4Queue *q, PM4Header header)
                     dmaBuffer);
         } break;
       case IT_MAP_PROCESS: {
-        if (gpuDevice->getGfxVersion() == GfxVersion::gfx90a) {
-            dmaBuffer = new PM4MapProcessMI200();
+        if (gpuDevice->getGfxVersion() == GfxVersion::gfx90a ||
+            gpuDevice->getGfxVersion() == GfxVersion::gfx942) {
+            dmaBuffer = new PM4MapProcessV2();
             cb = new DmaVirtCallback<uint64_t>(
                 [ = ] (const uint64_t &)
-                    { mapProcessGfx90a(q, (PM4MapProcessMI200 *)dmaBuffer); });
-            dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4MapProcessMI200),
+                    { mapProcessV2(q, (PM4MapProcessV2 *)dmaBuffer); });
+            dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4MapProcessV2),
                         cb, dmaBuffer);
         } else {
             dmaBuffer = new PM4MapProcess();
             cb = new DmaVirtCallback<uint64_t>(
                 [ = ] (const uint64_t &)
-                    { mapProcessGfx9(q, (PM4MapProcess *)dmaBuffer); });
+                    { mapProcessV1(q, (PM4MapProcess *)dmaBuffer); });
             dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4MapProcess), cb,
                         dmaBuffer);
         }
@@ -701,7 +702,7 @@ PM4PacketProcessor::mapProcess(uint32_t pasid, uint64_t ptBase,
 }
 
 void
-PM4PacketProcessor::mapProcessGfx9(PM4Queue *q, PM4MapProcess *pkt)
+PM4PacketProcessor::mapProcessV1(PM4Queue *q, PM4MapProcess *pkt)
 {
     q->incRptr(sizeof(PM4MapProcess));
 
@@ -716,9 +717,9 @@ PM4PacketProcessor::mapProcessGfx9(PM4Queue *q, PM4MapProcess *pkt)
 }
 
 void
-PM4PacketProcessor::mapProcessGfx90a(PM4Queue *q, PM4MapProcessMI200 *pkt)
+PM4PacketProcessor::mapProcessV2(PM4Queue *q, PM4MapProcessV2 *pkt)
 {
-    q->incRptr(sizeof(PM4MapProcessMI200));
+    q->incRptr(sizeof(PM4MapProcessV2));
 
     DPRINTF(PM4PacketProcessor, "PM4 map_process pasid: %p quantum: "
             "%d pt: %p signal: %p\n", pkt->pasid, pkt->processQuantum,
diff --git a/src/dev/amdgpu/pm4_packet_processor.hh b/src/dev/amdgpu/pm4_packet_processor.hh
index 82c3c2716f..71271415fd 100644
--- a/src/dev/amdgpu/pm4_packet_processor.hh
+++ b/src/dev/amdgpu/pm4_packet_processor.hh
@@ -146,8 +146,8 @@ class PM4PacketProcessor : public DmaVirtDevice
     void unmapQueues(PM4Queue *q, PM4UnmapQueues *pkt);
     void doneMQDWrite(Addr mqdAddr, Addr addr);
     void mapProcess(uint32_t pasid, uint64_t ptBase, uint32_t shMemBases);
-    void mapProcessGfx9(PM4Queue *q, PM4MapProcess *pkt);
-    void mapProcessGfx90a(PM4Queue *q, PM4MapProcessMI200 *pkt);
+    void mapProcessV1(PM4Queue *q, PM4MapProcess *pkt);
+    void mapProcessV2(PM4Queue *q, PM4MapProcessV2 *pkt);
     void processMQD(PM4MapQueues *pkt, PM4Queue *q, Addr addr, QueueDesc *mqd,
                     uint16_t vmid);
     void processSDMAMQD(PM4MapQueues *pkt, PM4Queue *q, Addr addr,
diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py
index 41ff9e7893..8cb40f1c87 100644
--- a/src/gpu-compute/GPU.py
+++ b/src/gpu-compute/GPU.py
@@ -45,7 +45,7 @@ class PrefetchType(Enum):
 
 
 class GfxVersion(ScopedEnum):
-    vals = ["gfx900", "gfx902", "gfx908", "gfx90a"]
+    vals = ["gfx900", "gfx902", "gfx908", "gfx90a", "gfx942"]
 
 
 class PoolManager(SimObject):
diff --git a/src/gpu-compute/hsa_queue_entry.hh b/src/gpu-compute/hsa_queue_entry.hh
index f015b091fc..44de1a8d32 100644
--- a/src/gpu-compute/hsa_queue_entry.hh
+++ b/src/gpu-compute/hsa_queue_entry.hh
@@ -94,9 +94,10 @@ class HSAQueueEntry
         // LLVM docs: https://www.llvm.org/docs/AMDGPUUsage.html
         //     #code-object-v3-kernel-descriptor
         //
-        // Currently, the only supported gfx version in gem5 that computes
-        // VGPR count differently is gfx90a.
-        if (gfx_version == GfxVersion::gfx90a) {
+        // Currently, the only supported gfx versions in gem5 that compute
+        // VGPR count differently are gfx90a and gfx942.
+        if (gfx_version == GfxVersion::gfx90a ||
+            gfx_version == GfxVersion::gfx942) {
             numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 8;
         } else {
             numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 4;
@@ -107,7 +108,8 @@ class HSAQueueEntry
         if (gfx_version == GfxVersion::gfx900 ||
                 gfx_version == GfxVersion::gfx902 ||
                 gfx_version == GfxVersion::gfx908 ||
-                gfx_version == GfxVersion::gfx90a) {
+                gfx_version == GfxVersion::gfx90a ||
+                gfx_version == GfxVersion::gfx942) {
             numSgprs = ((akc->granulated_wavefront_sgpr_count + 1) * 16)/2;
         } else {
             panic("Saw unknown gfx version setting up GPR counts\n");
diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc
index 98d882b20e..b5298bad4c 100644
--- a/src/gpu-compute/wavefront.cc
+++ b/src/gpu-compute/wavefront.cc
@@ -442,7 +442,8 @@ Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
     // Default to false and set to true for gem5 supported ISAs.
     bool packed_work_item_id = false;
 
-    if (task->gfxVersion() == GfxVersion::gfx90a) {
+    if (task->gfxVersion() == GfxVersion::gfx90a ||
+        task->gfxVersion() == GfxVersion::gfx942) {
         packed_work_item_id = true;
     }