From 8be5ce6fc96847a2910bfff8c966586dc8a4dddb Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Wed, 15 May 2024 10:49:05 -0700 Subject: [PATCH 1/3] dev-amdgpu,configs,gpu-compute: Add gfx942 version This is the version for MI300. For the most part, it is the same as MI200 with the exception of architected flat scratch (not yet implemented in gem5) and therefore a new version enum is required. Change-Id: Id18cd7b57c4eebd467c010a3f61e3117beb8d58a --- configs/example/gpufs/runfs.py | 6 +++--- configs/example/gpufs/system/amdgpu.py | 6 +++++- configs/example/gpufs/system/system.py | 2 +- src/dev/amdgpu/amdgpu_device.cc | 9 ++++++++- src/dev/amdgpu/pm4_defines.hh | 4 ++-- src/dev/amdgpu/pm4_packet_processor.cc | 17 +++++++++-------- src/dev/amdgpu/pm4_packet_processor.hh | 4 ++-- src/gpu-compute/GPU.py | 2 +- src/gpu-compute/hsa_queue_entry.hh | 10 ++++++---- src/gpu-compute/wavefront.cc | 3 ++- 10 files changed, 39 insertions(+), 24 deletions(-) diff --git a/configs/example/gpufs/runfs.py b/configs/example/gpufs/runfs.py index fed155bc44..866fa89822 100644 --- a/configs/example/gpufs/runfs.py +++ b/configs/example/gpufs/runfs.py @@ -134,9 +134,9 @@ def addRunFSOptions(parser): parser.add_argument( "--gpu-device", default="Vega10", - choices=["Vega10", "MI100", "MI200"], - help="GPU model to run: Vega10 (gfx900), MI100 (gfx908), or " - "MI200 (gfx90a)", + choices=["Vega10", "MI100", "MI200", "MI300X"], + help="GPU model to run: Vega10 (gfx900), MI100 (gfx908), MI200 " + "(gfx90a), or MI300X (gfx942).", ) parser.add_argument( diff --git a/configs/example/gpufs/system/amdgpu.py b/configs/example/gpufs/system/amdgpu.py index 0813759e2a..55937cd255 100644 --- a/configs/example/gpufs/system/amdgpu.py +++ b/configs/example/gpufs/system/amdgpu.py @@ -191,10 +191,14 @@ def connectGPU(system, args): system.pc.south_bridge.gpu.DeviceID = 0x740F system.pc.south_bridge.gpu.SubsystemVendorID = 0x1002 system.pc.south_bridge.gpu.SubsystemID = 0x0C34 + elif args.gpu_device == 
"MI300X": + system.pc.south_bridge.gpu.DeviceID = 0x740F + system.pc.south_bridge.gpu.SubsystemVendorID = 0x1002 + system.pc.south_bridge.gpu.SubsystemID = 0x0C34 elif args.gpu_device == "Vega10": system.pc.south_bridge.gpu.DeviceID = 0x6863 else: - panic(f"Unknown GPU device: {args.gpu_device}") + m5.util.panic(f"Unknown GPU device: {args.gpu_device}") # Use the gem5 default of 0x280 OR'd with 0x10 which tells Linux there is # a PCI capabilities list to travse. diff --git a/configs/example/gpufs/system/system.py b/configs/example/gpufs/system/system.py index 7c596f0ccf..1322650964 100644 --- a/configs/example/gpufs/system/system.py +++ b/configs/example/gpufs/system/system.py @@ -161,7 +161,7 @@ def makeGpuFSSystem(args): 0x7D000, ] sdma_sizes = [0x1000] * 8 - elif args.gpu_device == "MI200": + elif args.gpu_device == "MI200" or args.gpu_device == "MI300X": num_sdmas = 5 sdma_bases = [ 0x4980, diff --git a/src/dev/amdgpu/amdgpu_device.cc b/src/dev/amdgpu/amdgpu_device.cc index 6bb5f9c2c5..b3a91830fe 100644 --- a/src/dev/amdgpu/amdgpu_device.cc +++ b/src/dev/amdgpu/amdgpu_device.cc @@ -86,6 +86,8 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p) gfx_version = GfxVersion::gfx908; } else if (p.device_name == "MI200") { gfx_version = GfxVersion::gfx90a; + } else if (p.device_name == "MI300X") { + gfx_version = GfxVersion::gfx942; } else { panic("Unknown GPU device %s\n", p.device_name); } @@ -124,7 +126,8 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p) sdmaFunc.insert({0x10b, &SDMAEngine::setPageDoorbellOffsetLo}); sdmaFunc.insert({0xe0, &SDMAEngine::setPageSize}); sdmaFunc.insert({0x113, &SDMAEngine::setPageWptrLo}); - } else if (p.device_name == "MI100" || p.device_name == "MI200") { + } else if (p.device_name == "MI100" || p.device_name == "MI200" + || p.device_name == "MI300X") { sdmaFunc.insert({0xd9, &SDMAEngine::setPageBaseLo}); sdmaFunc.insert({0xe1, &SDMAEngine::setPageRptrLo}); sdmaFunc.insert({0xe0, &SDMAEngine::setPageRptrHi}); @@ 
-195,6 +198,10 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p) setRegVal(MI200_FB_LOCATION_BASE, mmhubBase >> 24); setRegVal(MI200_FB_LOCATION_TOP, mmhubTop >> 24); setRegVal(MI200_MEM_SIZE_REG, mem_size); + } else if (p.device_name == "MI300X") { + setRegVal(MI200_FB_LOCATION_BASE, mmhubBase >> 24); + setRegVal(MI200_FB_LOCATION_TOP, mmhubTop >> 24); + setRegVal(MI200_MEM_SIZE_REG, mem_size); } else { panic("Unknown GPU device %s\n", p.device_name); } diff --git a/src/dev/amdgpu/pm4_defines.hh b/src/dev/amdgpu/pm4_defines.hh index a303f8ef84..d00dc3730d 100644 --- a/src/dev/amdgpu/pm4_defines.hh +++ b/src/dev/amdgpu/pm4_defines.hh @@ -328,8 +328,8 @@ typedef struct GEM5_PACKED }; uint64_t completionSignal; }; -} PM4MapProcessMI200; -static_assert(sizeof(PM4MapProcessMI200) == 80); +} PM4MapProcessV2; +static_assert(sizeof(PM4MapProcessV2) == 80); typedef struct GEM5_PACKED { diff --git a/src/dev/amdgpu/pm4_packet_processor.cc b/src/dev/amdgpu/pm4_packet_processor.cc index a921942678..9a8ba13914 100644 --- a/src/dev/amdgpu/pm4_packet_processor.cc +++ b/src/dev/amdgpu/pm4_packet_processor.cc @@ -290,18 +290,19 @@ PM4PacketProcessor::decodeHeader(PM4Queue *q, PM4Header header) dmaBuffer); } break; case IT_MAP_PROCESS: { - if (gpuDevice->getGfxVersion() == GfxVersion::gfx90a) { - dmaBuffer = new PM4MapProcessMI200(); + if (gpuDevice->getGfxVersion() == GfxVersion::gfx90a || + gpuDevice->getGfxVersion() == GfxVersion::gfx942) { + dmaBuffer = new PM4MapProcessV2(); cb = new DmaVirtCallback( [ = ] (const uint64_t &) - { mapProcessGfx90a(q, (PM4MapProcessMI200 *)dmaBuffer); }); - dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4MapProcessMI200), + { mapProcessV2(q, (PM4MapProcessV2 *)dmaBuffer); }); + dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4MapProcessV2), cb, dmaBuffer); } else { dmaBuffer = new PM4MapProcess(); cb = new DmaVirtCallback( [ = ] (const uint64_t &) - { mapProcessGfx9(q, (PM4MapProcess *)dmaBuffer); }); + { mapProcessV1(q, (PM4MapProcess 
*)dmaBuffer); }); dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4MapProcess), cb, dmaBuffer); } @@ -701,7 +702,7 @@ PM4PacketProcessor::mapProcess(uint32_t pasid, uint64_t ptBase, } void -PM4PacketProcessor::mapProcessGfx9(PM4Queue *q, PM4MapProcess *pkt) +PM4PacketProcessor::mapProcessV1(PM4Queue *q, PM4MapProcess *pkt) { q->incRptr(sizeof(PM4MapProcess)); @@ -716,9 +717,9 @@ PM4PacketProcessor::mapProcessGfx9(PM4Queue *q, PM4MapProcess *pkt) } void -PM4PacketProcessor::mapProcessGfx90a(PM4Queue *q, PM4MapProcessMI200 *pkt) +PM4PacketProcessor::mapProcessV2(PM4Queue *q, PM4MapProcessV2 *pkt) { - q->incRptr(sizeof(PM4MapProcessMI200)); + q->incRptr(sizeof(PM4MapProcessV2)); DPRINTF(PM4PacketProcessor, "PM4 map_process pasid: %p quantum: " "%d pt: %p signal: %p\n", pkt->pasid, pkt->processQuantum, diff --git a/src/dev/amdgpu/pm4_packet_processor.hh b/src/dev/amdgpu/pm4_packet_processor.hh index 82c3c2716f..71271415fd 100644 --- a/src/dev/amdgpu/pm4_packet_processor.hh +++ b/src/dev/amdgpu/pm4_packet_processor.hh @@ -146,8 +146,8 @@ class PM4PacketProcessor : public DmaVirtDevice void unmapQueues(PM4Queue *q, PM4UnmapQueues *pkt); void doneMQDWrite(Addr mqdAddr, Addr addr); void mapProcess(uint32_t pasid, uint64_t ptBase, uint32_t shMemBases); - void mapProcessGfx9(PM4Queue *q, PM4MapProcess *pkt); - void mapProcessGfx90a(PM4Queue *q, PM4MapProcessMI200 *pkt); + void mapProcessV1(PM4Queue *q, PM4MapProcess *pkt); + void mapProcessV2(PM4Queue *q, PM4MapProcessV2 *pkt); void processMQD(PM4MapQueues *pkt, PM4Queue *q, Addr addr, QueueDesc *mqd, uint16_t vmid); void processSDMAMQD(PM4MapQueues *pkt, PM4Queue *q, Addr addr, diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py index 41ff9e7893..8cb40f1c87 100644 --- a/src/gpu-compute/GPU.py +++ b/src/gpu-compute/GPU.py @@ -45,7 +45,7 @@ class PrefetchType(Enum): class GfxVersion(ScopedEnum): - vals = ["gfx900", "gfx902", "gfx908", "gfx90a"] + vals = ["gfx900", "gfx902", "gfx908", "gfx90a", "gfx942"] class 
PoolManager(SimObject): diff --git a/src/gpu-compute/hsa_queue_entry.hh b/src/gpu-compute/hsa_queue_entry.hh index f015b091fc..44de1a8d32 100644 --- a/src/gpu-compute/hsa_queue_entry.hh +++ b/src/gpu-compute/hsa_queue_entry.hh @@ -94,9 +94,10 @@ class HSAQueueEntry // LLVM docs: https://www.llvm.org/docs/AMDGPUUsage.html // #code-object-v3-kernel-descriptor // - // Currently, the only supported gfx version in gem5 that computes - // VGPR count differently is gfx90a. - if (gfx_version == GfxVersion::gfx90a) { + // Currently, the only supported gfx versions in gem5 that compute + // VGPR count differently are gfx90a and gfx942. + if (gfx_version == GfxVersion::gfx90a || + gfx_version == GfxVersion::gfx942) { numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 8; } else { numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 4; @@ -107,7 +108,8 @@ class HSAQueueEntry if (gfx_version == GfxVersion::gfx900 || gfx_version == GfxVersion::gfx902 || gfx_version == GfxVersion::gfx908 || - gfx_version == GfxVersion::gfx90a) { + gfx_version == GfxVersion::gfx90a || + gfx_version == GfxVersion::gfx942) { numSgprs = ((akc->granulated_wavefront_sgpr_count + 1) * 16)/2; } else { panic("Saw unknown gfx version setting up GPR counts\n"); diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc index 98d882b20e..b5298bad4c 100644 --- a/src/gpu-compute/wavefront.cc +++ b/src/gpu-compute/wavefront.cc @@ -442,7 +442,8 @@ Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems) // Default to false and set to true for gem5 supported ISAs. 
bool packed_work_item_id = false; - if (task->gfxVersion() == GfxVersion::gfx90a) { + if (task->gfxVersion() == GfxVersion::gfx90a || + task->gfxVersion() == GfxVersion::gfx942) { packed_work_item_id = true; } From c1803eafaccf33130b87f1bb6231103f7ac1bf63 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Wed, 15 May 2024 12:00:47 -0700 Subject: [PATCH 2/3] arch-vega: Architected flat scratch and scratch insts Architected flat scratch is added in MI300, which stores the scratch base address in dedicated registers rather than in SGPRs. These registers are used by scratch_ instructions. These are flat instructions which explicitly target the private memory aperture. These instructions have a different address calculation than global_ instructions. This change implements architected flat scratch support, fixes the address calculation of scratch_ instructions, and implements decodings for some scratch_ instructions. Previous flat_ instructions which happen to access the private memory aperture have no change in address calculation. Since scratch_ instructions are identical to flat_ instructions except for address calculation, the decodings simply reuse existing flat_ instruction definitions. 
Change-Id: I1e1d15a2fbcc7a4a678157c35608f4f22b359e21 --- src/arch/amdgpu/vega/gpu_decoder.cc | 24 +++---- src/arch/amdgpu/vega/insts/op_encodings.hh | 74 ++++++++++++++++++- src/gpu-compute/gpu_dyn_inst.cc | 82 +++++++++++++++------- src/gpu-compute/gpu_static_inst.hh | 3 +- src/gpu-compute/wavefront.cc | 22 ++++++ src/gpu-compute/wavefront.hh | 2 + 6 files changed, 160 insertions(+), 47 deletions(-) diff --git a/src/arch/amdgpu/vega/gpu_decoder.cc b/src/arch/amdgpu/vega/gpu_decoder.cc index 6f34301f48..eb5a5bb309 100644 --- a/src/arch/amdgpu/vega/gpu_decoder.cc +++ b/src/arch/amdgpu/vega/gpu_decoder.cc @@ -9922,29 +9922,25 @@ namespace VegaISA GPUStaticInst* Decoder::decode_OP_SCRATCH__SCRATCH_LOAD_DWORD(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_FLAT__FLAT_LOAD_DWORD(&iFmt->iFmt_FLAT); } GPUStaticInst* Decoder::decode_OP_SCRATCH__SCRATCH_LOAD_DWORDX2(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_FLAT__FLAT_LOAD_DWORDX2(&iFmt->iFmt_FLAT); } GPUStaticInst* Decoder::decode_OP_SCRATCH__SCRATCH_LOAD_DWORDX3(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_FLAT__FLAT_LOAD_DWORDX3(&iFmt->iFmt_FLAT); } GPUStaticInst* Decoder::decode_OP_SCRATCH__SCRATCH_LOAD_DWORDX4(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_FLAT__FLAT_LOAD_DWORDX4(&iFmt->iFmt_FLAT); } GPUStaticInst* @@ -9977,29 +9973,25 @@ namespace VegaISA GPUStaticInst* Decoder::decode_OP_SCRATCH__SCRATCH_STORE_DWORD(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_FLAT__FLAT_STORE_DWORD(&iFmt->iFmt_FLAT); } GPUStaticInst* Decoder::decode_OP_SCRATCH__SCRATCH_STORE_DWORDX2(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new 
Inst_FLAT__FLAT_STORE_DWORDX2(&iFmt->iFmt_FLAT); } GPUStaticInst* Decoder::decode_OP_SCRATCH__SCRATCH_STORE_DWORDX3(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_FLAT__FLAT_STORE_DWORDX3(&iFmt->iFmt_FLAT); } GPUStaticInst* Decoder::decode_OP_SCRATCH__SCRATCH_STORE_DWORDX4(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_FLAT__FLAT_STORE_DWORDX4(&iFmt->iFmt_FLAT); } GPUStaticInst* diff --git a/src/arch/amdgpu/vega/insts/op_encodings.hh b/src/arch/amdgpu/vega/insts/op_encodings.hh index 9ab7b84974..5861f296ff 100644 --- a/src/arch/amdgpu/vega/insts/op_encodings.hh +++ b/src/arch/amdgpu/vega/insts/op_encodings.hh @@ -1258,13 +1258,12 @@ namespace VegaISA // If saddr = 0x7f there is no scalar reg to read and address will // be a 64-bit address. Otherwise, saddr is the reg index for a // scalar reg used as the base address for a 32-bit address. - if ((saddr == 0x7f && (isFlatGlobal() || isFlatScratch())) - || isFlat()) { + if ((saddr == 0x7f && isFlatGlobal()) || isFlat()) { ConstVecOperandU64 vbase(gpuDynInst, vaddr); vbase.read(); calcAddrVgpr(gpuDynInst, vbase, offset); - } else { + } else if (isFlatGlobal()) { // Assume we are operating in 64-bit mode and read a pair of // SGPRs for the address base. ConstScalarOperandU64 sbase(gpuDynInst, saddr); @@ -1274,6 +1273,57 @@ namespace VegaISA voffset.read(); calcAddrSgpr(gpuDynInst, voffset, sbase, offset); + // For scratch, saddr = 0x7f there is no scalar reg to read and + // a vgpr will be used for address offset. Otherwise, saddr is + // the sgpr index holding the address offset. For scratch + // instructions the offset GPR is always 32-bits. 
+ } else if (saddr != 0x7f) { + assert(isFlatScratch()); + + ConstScalarOperandU32 soffset(gpuDynInst, saddr); + soffset.read(); + + Addr flat_scratch_addr = readFlatScratch(gpuDynInst); + + int elemSize; + auto staticInst = gpuDynInst->staticInstruction(); + if (gpuDynInst->isLoad()) { + elemSize = staticInst->getOperandSize(2); + } else { + assert(gpuDynInst->isStore()); + elemSize = staticInst->getOperandSize(1); + } + + unsigned swizzleOffset = soffset.rawData() + offset; + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + gpuDynInst->addr.at(lane) = flat_scratch_addr + + swizzle(swizzleOffset, lane, elemSize); + } + } + } else { + assert(isFlatScratch()); + + ConstVecOperandU32 voffset(gpuDynInst, vaddr); + voffset.read(); + + Addr flat_scratch_addr = readFlatScratch(gpuDynInst); + + int elemSize; + auto staticInst = gpuDynInst->staticInstruction(); + if (gpuDynInst->isLoad()) { + elemSize = staticInst->getOperandSize(2); + } else { + assert(gpuDynInst->isStore()); + elemSize = staticInst->getOperandSize(1); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + gpuDynInst->addr.at(lane) = flat_scratch_addr + + swizzle(voffset[lane] + offset, lane, elemSize); + } + } } if (isFlat()) { @@ -1285,6 +1335,7 @@ namespace VegaISA assert(isFlatScratch()); gpuDynInst->staticInstruction()->executed_as = enums::SC_PRIVATE; + gpuDynInst->resolveFlatSegment(gpuDynInst->exec_mask); } } @@ -1421,6 +1472,23 @@ namespace VegaISA } } } + + VecElemU32 + swizzle(VecElemU32 offset, int lane, int elem_size) + { + // This is not described in the spec. We use the swizzle from + // buffer memory instructions and fix the stride to 4. Multiply + // the thread ID by the storage size to avoid threads clobbering + // their data. 
+ return ((offset / 4) * 4 * 64) + + (offset % 4) + (lane * elem_size); + } + + Addr + readFlatScratch(GPUDynInstPtr gpuDynInst) + { + return gpuDynInst->computeUnit()->shader->getScratchBase(); + } }; // Inst_FLAT } // namespace VegaISA } // namespace gem5 diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc index 66b2b8ec49..80f18d2fa2 100644 --- a/src/gpu-compute/gpu_dyn_inst.cc +++ b/src/gpu-compute/gpu_dyn_inst.cc @@ -910,35 +910,63 @@ GPUDynInst::resolveFlatSegment(const VectorMask &mask) * #flat-addressing */ - uint32_t numSgprs = wavefront()->maxSgprs; - uint32_t physSgprIdx = - wavefront()->computeUnit->registerManager->mapSgpr(wavefront(), - numSgprs - 4); - uint32_t offset = - wavefront()->computeUnit->srf[simdId]->read(physSgprIdx); - physSgprIdx = - wavefront()->computeUnit->registerManager->mapSgpr(wavefront(), - numSgprs - 3); - uint32_t size = - wavefront()->computeUnit->srf[simdId]->read(physSgprIdx); - for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) { - if (mask[lane]) { - addr[lane] = addr[lane] + lane * size + offset + - wavefront()->computeUnit->shader->getHiddenPrivateBase() - - wavefront()->computeUnit->shader->getScratchBase(); + ComputeUnit *cu = wavefront()->computeUnit; + + if (wavefront()->gfxVersion == GfxVersion::gfx942) { + // Architected flat scratch base address in FLAT_SCRATCH registers + uint32_t fs_lo = cu->srf[simdId]->read( + VegaISA::REG_FLAT_SCRATCH_LO); + uint32_t fs_hi = cu->srf[simdId]->read( + VegaISA::REG_FLAT_SCRATCH_HI); + + Addr arch_flat_scratch = ((Addr)(fs_hi) << 32) | fs_lo; + + for (int lane = 0; lane < cu->wfSize(); ++lane) { + if (mask[lane]) { + // The scratch base is added for other gfx versions, + // otherwise this would simply add the register base. + addr[lane] = addr[lane] - cu->shader->getScratchBase() + + arch_flat_scratch; + } + } + } else { + // In absolute flat scratch the program needs to place scratch + // address in SGPRn-3,4. 
+ uint32_t numSgprs = wavefront()->maxSgprs; + uint32_t physSgprIdx = + cu->registerManager->mapSgpr(wavefront(), numSgprs - 4); + uint32_t offset = cu->srf[simdId]->read(physSgprIdx); + physSgprIdx = + cu->registerManager->mapSgpr(wavefront(), numSgprs - 3); + uint32_t size = cu->srf[simdId]->read(physSgprIdx); + + + for (int lane = 0; lane < cu->wfSize(); ++lane) { + if (mask[lane]) { + addr[lane] = addr[lane] + lane * size + offset + + cu->shader->getHiddenPrivateBase() - + cu->shader->getScratchBase(); + } } } - wavefront()->execUnitId = wavefront()->flatLmUnitId; - wavefront()->decLGKMInstsIssued(); - if (isLoad()) { - wavefront()->rdLmReqsInPipe--; - } else if (isStore()) { - wavefront()->wrLmReqsInPipe--; - } else if (isAtomic() || isMemSync()) { - wavefront()->wrLmReqsInPipe--; - wavefront()->rdLmReqsInPipe--; - } else { - panic("Invalid memory operation!\n"); + + wavefront()->execUnitId = wavefront()->flatLmUnitId; + + // For FLAT the local memory pipe counters are incremented, but they + // are not incremented for explicit scratch_* instructions. Only + // decrement these counters if we are explicitly a FLAT instruction. 
+ if (isFlat()) { + wavefront()->decLGKMInstsIssued(); + if (isLoad()) { + wavefront()->rdLmReqsInPipe--; + } else if (isStore()) { + wavefront()->wrLmReqsInPipe--; + } else if (isAtomic() || isMemSync()) { + wavefront()->wrLmReqsInPipe--; + wavefront()->rdLmReqsInPipe--; + } else { + panic("Invalid memory operation!\n"); + } } } else { for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) { diff --git a/src/gpu-compute/gpu_static_inst.hh b/src/gpu-compute/gpu_static_inst.hh index 6132ab2d29..1ec06dc7d3 100644 --- a/src/gpu-compute/gpu_static_inst.hh +++ b/src/gpu-compute/gpu_static_inst.hh @@ -179,7 +179,8 @@ class GPUStaticInst : public GPUStaticInstFlags { return _flags[MemoryRef] && (_flags[GlobalSegment] || _flags[PrivateSegment] || _flags[ReadOnlySegment] || - _flags[SpillSegment] || _flags[FlatGlobal]); + _flags[SpillSegment] || _flags[FlatGlobal] || + _flags[FlatScratch]); } bool diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc index b5298bad4c..de7c2333c2 100644 --- a/src/gpu-compute/wavefront.cc +++ b/src/gpu-compute/wavefront.cc @@ -118,6 +118,7 @@ void Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems) { int regInitIdx = 0; + gfxVersion = task->gfxVersion(); // Iterate over all the init fields and check which // bits are enabled. 
Useful information can be found here: @@ -378,8 +379,29 @@ Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems) wfSlotId, wfDynId, physSgprIdx, workGroupId[2]); break; case PrivSegWaveByteOffset: + + // For architected flat scratch, this enable is reused to set + // the FLAT_SCRATCH register pair to the scratch backing + // memory: https://llvm.org/docs/AMDGPUUsage.html#flat-scratch + if (task->gfxVersion() == GfxVersion::gfx942) { + Addr arch_flat_scratch = + task->amdQueue.scratch_backing_memory_location; + computeUnit->srf[simdId]->write( + VegaISA::REG_FLAT_SCRATCH_HI, + bits(arch_flat_scratch, 63, 32)); + computeUnit->srf[simdId]->write( + VegaISA::REG_FLAT_SCRATCH_LO, + bits(arch_flat_scratch, 31, 0)); + + break; + } + + // Not architected flat scratch. Write the scratch wavefront + // offset: https://llvm.org/docs/AMDGPUUsage.html + // #amdgpu-amdhsa-initial-kernel-execution-state physSgprIdx = computeUnit->registerManager->mapSgpr(this, regInitIdx); + /** * the compute_tmpring_size_wavesize specifies the number of * kB allocated per wavefront, hence the multiplication by diff --git a/src/gpu-compute/wavefront.hh b/src/gpu-compute/wavefront.hh index 82035f7d47..b7dff4617b 100644 --- a/src/gpu-compute/wavefront.hh +++ b/src/gpu-compute/wavefront.hh @@ -92,6 +92,8 @@ class Wavefront : public SimObject S_BARRIER }; + // gfx version wavefront is executing + GfxVersion gfxVersion; // HW slot id where the WF is mapped to inside a SIMD unit const int wfSlotId; int kernId; From 61648352301f98360b613619ba7bb64dba52b9bf Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Wed, 15 May 2024 17:00:37 -0700 Subject: [PATCH 3/3] configs: GPUFS: MI300X Add a config capable of simulating MI300X ISA (gfx942). This is similar to the mi200.py config and uses the same scripts followed by some tuneable parameters. This config optionally lets the user call the runMI300GPU function with gem5 resources. 
This allows for something like the following before a VIPER stdlib python is available: ``` import mi300 from gem5.resources.resource import obtain_resource disk = obtain_resource("x86-gpu-fs-img") kernel = obtain_resource("x86-linux-kernel-5.4.0-105-generic") app = obtain_resource("square-gpu-test") mi300.runMI300GPUFS("X86KvmCPU", disk, kernel, app) ``` Tested cold boot config, checkpoint create and restore, and using gem5 resources. Change-Id: I50a13d7a3d207786b779bf7fd47a5645256b1e6a --- configs/example/gpufs/mi300.py | 172 +++++++++++++++++++++++++++++++++ 1 file changed, 172 insertions(+) create mode 100644 configs/example/gpufs/mi300.py diff --git a/configs/example/gpufs/mi300.py b/configs/example/gpufs/mi300.py new file mode 100644 index 0000000000..9e0e0da622 --- /dev/null +++ b/configs/example/gpufs/mi300.py @@ -0,0 +1,172 @@ +# Copyright (c) 2024 Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from this +# software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +""" This file creates an X86 system with a KVM CPU and GPU device capable of +running the MI300 ISA (gfx942). Most of this file sets up a runscript which +will load in a binary, shell script, or python file from the host and run that +within gem5. Jump to line 146 for list of system parameters to configure. +""" + +import argparse +import base64 +import os +import sys +import tempfile +from typing import Optional + +import runfs +from amd import AmdGPUOptions +from common import ( + GPUTLBOptions, + Options, +) +from ruby import Ruby + +import m5 + +from gem5.resources.resource import AbstractResource + +demo_runscript_without_checkpoint = """\ +export LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH +export HSA_ENABLE_INTERRUPT=0 +export HCC_AMDGPU_TARGET=gfx942 +export HSA_OVERRIDE_GFX_VERSION="9.4.2" +dmesg -n8 +cat /proc/cpuinfo +dd if=/root/roms/mi200.rom of=/dev/mem bs=1k seek=768 count=128 +if [ ! -f /lib/modules/`uname -r`/updates/dkms/amdgpu.ko ]; then + echo "ERROR: Missing DKMS package for kernel `uname -r`. Exiting gem5." 
+ /sbin/m5 exit +fi +modprobe -v amdgpu ip_block_mask=0x6f ppfeaturemask=0 dpm=0 audio=0 ras_enable=0 +echo "Running {} {}" +echo "{}" | base64 -d > myapp +chmod +x myapp +./myapp {} +/sbin/m5 exit +""" + +demo_runscript_with_checkpoint = """\ +export LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH +export HSA_ENABLE_INTERRUPT=0 +export HCC_AMDGPU_TARGET=gfx942 +export HSA_OVERRIDE_GFX_VERSION="9.4.2" +dmesg -n8 +dd if=/root/roms/mi200.rom of=/dev/mem bs=1k seek=768 count=128 +if [ ! -f /lib/modules/`uname -r`/updates/dkms/amdgpu.ko ]; then + echo "ERROR: Missing DKMS package for kernel `uname -r`. Exiting gem5." + /sbin/m5 exit +fi +modprobe -v amdgpu ip_block_mask=0x6f ppfeaturemask=0 dpm=0 audio=0 ras_enable=0 +echo "Running {} {}" +echo "{}" | base64 -d > myapp +chmod +x myapp +/sbin/m5 checkpoint +./myapp {} +/sbin/m5 exit +""" + + +def addDemoOptions(parser): + parser.add_argument( + "-a", "--app", default=None, help="GPU application to run" + ) + parser.add_argument( + "-o", "--opts", default="", help="GPU application arguments" + ) + + +def runMI300GPUFS( + cpu_type, + disk: Optional[AbstractResource] = None, + kernel: Optional[AbstractResource] = None, + app: Optional[AbstractResource] = None, +): + parser = argparse.ArgumentParser() + runfs.addRunFSOptions(parser) + Options.addCommonOptions(parser) + AmdGPUOptions.addAmdGPUOptions(parser) + Ruby.define_options(parser) + GPUTLBOptions.tlb_options(parser) + addDemoOptions(parser) + + # Parse now so we can override options + args = parser.parse_args() + demo_runscript = "" + + if disk != None: + args.disk_image = disk.get_local_path() + if kernel != None: + args.kernel = kernel.get_local_path() + if app != None: + args.app = app.get_local_path() + + # Create temp script to run application + if not os.path.isfile(args.app): + print("Could not find application", args.app) + sys.exit(1) + + # Choose runscript based on whether any checkpointing args are set + if args.checkpoint_dir is not None: + demo_runscript = 
demo_runscript_with_checkpoint + else: + demo_runscript = demo_runscript_without_checkpoint + + with open(os.path.abspath(args.app), "rb") as binfile: + encodedBin = base64.b64encode(binfile.read()).decode() + + _, tempRunscript = tempfile.mkstemp() + with open(tempRunscript, "w") as b64file: + runscriptStr = demo_runscript.format( + args.app, args.opts, encodedBin, args.opts + ) + b64file.write(runscriptStr) + + args.script = tempRunscript + + # Defaults for CPU + args.cpu_type = "X86KvmCPU" + args.mem_size = "8GB" + + # Defaults for MI300X + args.gpu_device = "MI300X" + args.dgpu_mem_size = "16GB" # GPU memory size, must be 16GB currently. + + # See: https://rocm.docs.amd.com/en/latest/conceptual/gpu-arch/mi300.html + # Topology for one XCD. Number of CUs is approximately 304 / 8, rounded + # up to 40 due to gem5 restriction of 4 CUs per SQC / scalar cache. + args.num_compute_units = 40 + args.gpu_topology = "Crossbar" + + # Run gem5 + runfs.runGpuFSSystem(args) + + +if __name__ == "__m5_main__": + runMI300GPUFS("X86KvmCPU")