dev-amdgpu,gpu-compute,configs: MI300X (#1141)

Release of MI300X simulation capability:

- Implements the required MI300X features over MI200 (currently only
architecture flat scratch).
- Make the gpu-compute model use MI200 features when MI300X / gfx942 is
configured.
- Fix up the scratch_ instructions, which seem to be preferred over
buffer_ in debug hipcc builds.
- Add mi300.py config similar to mi200.py. This config can optionally
use resources instead of command line args.
This commit is contained in:
Matthew Poremba
2024-05-17 09:26:04 -07:00
committed by GitHub
16 changed files with 371 additions and 71 deletions

View File

@@ -0,0 +1,172 @@
# Copyright (c) 2024 Advanced Micro Devices, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
""" This file creates an X86 system with a KVM CPU and GPU device capable of
running the MI300 ISA (gfx942). Most of this file sets up a runscript which
will load in a binary, shell script, or python file from the host and run that
within gem5. Jump to line 146 for list of system parameters to configure.
"""
import argparse
import base64
import os
import sys
import tempfile
from typing import Optional
import runfs
from amd import AmdGPUOptions
from common import (
GPUTLBOptions,
Options,
)
from ruby import Ruby
import m5
from gem5.resources.resource import AbstractResource
# Shell runscript template executed inside the simulated system. The four
# "{}" placeholders are filled via str.format() with: the app path (echoed),
# the app options (echoed), the base64-encoded app binary (recreated as
# ./myapp inside the guest), and the app options again (passed to ./myapp).
# The dd line loads the GPU VBIOS ROM into memory before the driver probes.
demo_runscript_without_checkpoint = """\
export LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH
export HSA_ENABLE_INTERRUPT=0
export HCC_AMDGPU_TARGET=gfx942
export HSA_OVERRIDE_GFX_VERSION="9.4.2"
dmesg -n8
cat /proc/cpuinfo
dd if=/root/roms/mi200.rom of=/dev/mem bs=1k seek=768 count=128
if [ ! -f /lib/modules/`uname -r`/updates/dkms/amdgpu.ko ]; then
echo "ERROR: Missing DKMS package for kernel `uname -r`. Exiting gem5."
/sbin/m5 exit
fi
modprobe -v amdgpu ip_block_mask=0x6f ppfeaturemask=0 dpm=0 audio=0 ras_enable=0
echo "Running {} {}"
echo "{}" | base64 -d > myapp
chmod +x myapp
./myapp {}
/sbin/m5 exit
"""
# Variant of the demo runscript that takes a gem5 checkpoint (via
# "/sbin/m5 checkpoint") after driver setup but immediately before running
# the application, so later runs can skip boot and driver initialization.
# The four "{}" placeholders are filled via str.format() with: the app path
# (echoed), the app options (echoed), the base64-encoded app binary
# (recreated as ./myapp in the guest), and the app options again.
demo_runscript_with_checkpoint = """\
export LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH
export HSA_ENABLE_INTERRUPT=0
export HCC_AMDGPU_TARGET=gfx942
export HSA_OVERRIDE_GFX_VERSION="9.4.2"
dmesg -n8
dd if=/root/roms/mi200.rom of=/dev/mem bs=1k seek=768 count=128
if [ ! -f /lib/modules/`uname -r`/updates/dkms/amdgpu.ko ]; then
echo "ERROR: Missing DKMS package for kernel `uname -r`. Exiting gem5."
/sbin/m5 exit
fi
modprobe -v amdgpu ip_block_mask=0x6f ppfeaturemask=0 dpm=0 audio=0 ras_enable=0
echo "Running {} {}"
echo "{}" | base64 -d > myapp
chmod +x myapp
/sbin/m5 checkpoint
./myapp {}
/sbin/m5 exit
"""
def addDemoOptions(parser):
    """Register the demo-specific command line options on ``parser``.

    Adds:
        -a / --app:  path of the GPU application to run (default: None)
        -o / --opts: argument string passed to the application (default: "")
    """
    demo_flags = [
        ("-a", "--app", None, "GPU application to run"),
        ("-o", "--opts", "", "GPU application arguments"),
    ]
    for short_opt, long_opt, default_val, desc in demo_flags:
        parser.add_argument(short_opt, long_opt, default=default_val, help=desc)
def runMI300GPUFS(
    cpu_type,
    disk: Optional[AbstractResource] = None,
    kernel: Optional[AbstractResource] = None,
    app: Optional[AbstractResource] = None,
):
    """Configure and run a full-system MI300X (gfx942) GPU simulation.

    Builds the argument parser shared with the other GPUFS configs, embeds
    the GPU application in a generated runscript, forces the MI300X device
    defaults, and hands off to runfs.runGpuFSSystem().

    Parameters:
        cpu_type: gem5 CPU model for the host CPU (e.g. "X86KvmCPU").
        disk: optional resource overriding the --disk-image argument.
        kernel: optional resource overriding the --kernel argument.
        app: optional resource overriding the -a/--app argument.
    """
    parser = argparse.ArgumentParser()
    runfs.addRunFSOptions(parser)
    Options.addCommonOptions(parser)
    AmdGPUOptions.addAmdGPUOptions(parser)
    Ruby.define_options(parser)
    GPUTLBOptions.tlb_options(parser)
    addDemoOptions(parser)

    # Parse now so we can override options
    args = parser.parse_args()
    demo_runscript = ""

    # Resources, when provided, take precedence over command line args.
    if disk is not None:
        args.disk_image = disk.get_local_path()
    if kernel is not None:
        args.kernel = kernel.get_local_path()
    if app is not None:
        args.app = app.get_local_path()

    # Create temp script to run application. Guard against args.app being
    # None (no -a flag and no app resource), which os.path.isfile rejects.
    if args.app is None or not os.path.isfile(args.app):
        print("Could not find application", args.app)
        sys.exit(1)

    # Choose runscript based on whether any checkpointing args are set
    if args.checkpoint_dir is not None:
        demo_runscript = demo_runscript_with_checkpoint
    else:
        demo_runscript = demo_runscript_without_checkpoint

    # Embed the application binary as base64 in the runscript so it can be
    # reconstructed inside the simulated system.
    with open(os.path.abspath(args.app), "rb") as binfile:
        encodedBin = base64.b64encode(binfile.read()).decode()

    _, tempRunscript = tempfile.mkstemp()
    with open(tempRunscript, "w") as b64file:
        runscriptStr = demo_runscript.format(
            args.app, args.opts, encodedBin, args.opts
        )
        b64file.write(runscriptStr)
    args.script = tempRunscript

    # Defaults for CPU. Honor the requested CPU model instead of silently
    # ignoring the cpu_type parameter.
    args.cpu_type = cpu_type
    args.mem_size = "8GB"

    # Defaults for MI300X
    args.gpu_device = "MI300X"
    args.dgpu_mem_size = "16GB"  # GPU memory size, must be 16GB currently.

    # See: https://rocm.docs.amd.com/en/latest/conceptual/gpu-arch/mi300.html
    # Topology for one XCD. Number of CUs is approximately 304 / 8, rounded
    # up to 40 due to gem5 restriction of 4 CUs per SQC / scalar cache.
    args.num_compute_units = 40
    args.gpu_topology = "Crossbar"

    # Run gem5
    runfs.runGpuFSSystem(args)
if __name__ == "__m5_main__":
runMI300GPUFS("X86KvmCPU")

View File

@@ -134,9 +134,9 @@ def addRunFSOptions(parser):
parser.add_argument(
"--gpu-device",
default="Vega10",
choices=["Vega10", "MI100", "MI200"],
help="GPU model to run: Vega10 (gfx900), MI100 (gfx908), or "
"MI200 (gfx90a)",
choices=["Vega10", "MI100", "MI200", "MI300X"],
help="GPU model to run: Vega10 (gfx900), MI100 (gfx908), MI200 "
"(gfx90a), or MI300X (gfx942).",
)
parser.add_argument(

View File

@@ -191,10 +191,14 @@ def connectGPU(system, args):
system.pc.south_bridge.gpu.DeviceID = 0x740F
system.pc.south_bridge.gpu.SubsystemVendorID = 0x1002
system.pc.south_bridge.gpu.SubsystemID = 0x0C34
elif args.gpu_device == "MI300X":
system.pc.south_bridge.gpu.DeviceID = 0x740F
system.pc.south_bridge.gpu.SubsystemVendorID = 0x1002
system.pc.south_bridge.gpu.SubsystemID = 0x0C34
elif args.gpu_device == "Vega10":
system.pc.south_bridge.gpu.DeviceID = 0x6863
else:
panic(f"Unknown GPU device: {args.gpu_device}")
m5.util.panic(f"Unknown GPU device: {args.gpu_device}")
# Use the gem5 default of 0x280 OR'd with 0x10 which tells Linux there is
# a PCI capabilities list to traverse.

View File

@@ -161,7 +161,7 @@ def makeGpuFSSystem(args):
0x7D000,
]
sdma_sizes = [0x1000] * 8
elif args.gpu_device == "MI200":
elif args.gpu_device == "MI200" or args.gpu_device == "MI300X":
num_sdmas = 5
sdma_bases = [
0x4980,

View File

@@ -9922,29 +9922,25 @@ namespace VegaISA
GPUStaticInst*
Decoder::decode_OP_SCRATCH__SCRATCH_LOAD_DWORD(MachInst iFmt)
{
fatal("Trying to decode instruction without a class\n");
return nullptr;
return new Inst_FLAT__FLAT_LOAD_DWORD(&iFmt->iFmt_FLAT);
}
GPUStaticInst*
Decoder::decode_OP_SCRATCH__SCRATCH_LOAD_DWORDX2(MachInst iFmt)
{
fatal("Trying to decode instruction without a class\n");
return nullptr;
return new Inst_FLAT__FLAT_LOAD_DWORDX2(&iFmt->iFmt_FLAT);
}
GPUStaticInst*
Decoder::decode_OP_SCRATCH__SCRATCH_LOAD_DWORDX3(MachInst iFmt)
{
fatal("Trying to decode instruction without a class\n");
return nullptr;
return new Inst_FLAT__FLAT_LOAD_DWORDX3(&iFmt->iFmt_FLAT);
}
GPUStaticInst*
Decoder::decode_OP_SCRATCH__SCRATCH_LOAD_DWORDX4(MachInst iFmt)
{
fatal("Trying to decode instruction without a class\n");
return nullptr;
return new Inst_FLAT__FLAT_LOAD_DWORDX4(&iFmt->iFmt_FLAT);
}
GPUStaticInst*
@@ -9977,29 +9973,25 @@ namespace VegaISA
GPUStaticInst*
Decoder::decode_OP_SCRATCH__SCRATCH_STORE_DWORD(MachInst iFmt)
{
fatal("Trying to decode instruction without a class\n");
return nullptr;
return new Inst_FLAT__FLAT_STORE_DWORD(&iFmt->iFmt_FLAT);
}
GPUStaticInst*
Decoder::decode_OP_SCRATCH__SCRATCH_STORE_DWORDX2(MachInst iFmt)
{
fatal("Trying to decode instruction without a class\n");
return nullptr;
return new Inst_FLAT__FLAT_STORE_DWORDX2(&iFmt->iFmt_FLAT);
}
GPUStaticInst*
Decoder::decode_OP_SCRATCH__SCRATCH_STORE_DWORDX3(MachInst iFmt)
{
fatal("Trying to decode instruction without a class\n");
return nullptr;
return new Inst_FLAT__FLAT_STORE_DWORDX3(&iFmt->iFmt_FLAT);
}
GPUStaticInst*
Decoder::decode_OP_SCRATCH__SCRATCH_STORE_DWORDX4(MachInst iFmt)
{
fatal("Trying to decode instruction without a class\n");
return nullptr;
return new Inst_FLAT__FLAT_STORE_DWORDX4(&iFmt->iFmt_FLAT);
}
GPUStaticInst*

View File

@@ -1258,13 +1258,12 @@ namespace VegaISA
// If saddr = 0x7f there is no scalar reg to read and address will
// be a 64-bit address. Otherwise, saddr is the reg index for a
// scalar reg used as the base address for a 32-bit address.
if ((saddr == 0x7f && (isFlatGlobal() || isFlatScratch()))
|| isFlat()) {
if ((saddr == 0x7f && isFlatGlobal()) || isFlat()) {
ConstVecOperandU64 vbase(gpuDynInst, vaddr);
vbase.read();
calcAddrVgpr(gpuDynInst, vbase, offset);
} else {
} else if (isFlatGlobal()) {
// Assume we are operating in 64-bit mode and read a pair of
// SGPRs for the address base.
ConstScalarOperandU64 sbase(gpuDynInst, saddr);
@@ -1274,6 +1273,57 @@ namespace VegaISA
voffset.read();
calcAddrSgpr(gpuDynInst, voffset, sbase, offset);
// For scratch, saddr = 0x7f there is no scalar reg to read and
// a vgpr will be used for address offset. Otherwise, saddr is
// the sgpr index holding the address offset. For scratch
// instructions the offset GPR is always 32-bits.
} else if (saddr != 0x7f) {
assert(isFlatScratch());
ConstScalarOperandU32 soffset(gpuDynInst, saddr);
soffset.read();
Addr flat_scratch_addr = readFlatScratch(gpuDynInst);
int elemSize;
auto staticInst = gpuDynInst->staticInstruction();
if (gpuDynInst->isLoad()) {
elemSize = staticInst->getOperandSize(2);
} else {
assert(gpuDynInst->isStore());
elemSize = staticInst->getOperandSize(1);
}
unsigned swizzleOffset = soffset.rawData() + offset;
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
if (gpuDynInst->exec_mask[lane]) {
gpuDynInst->addr.at(lane) = flat_scratch_addr
+ swizzle(swizzleOffset, lane, elemSize);
}
}
} else {
assert(isFlatScratch());
ConstVecOperandU32 voffset(gpuDynInst, vaddr);
voffset.read();
Addr flat_scratch_addr = readFlatScratch(gpuDynInst);
int elemSize;
auto staticInst = gpuDynInst->staticInstruction();
if (gpuDynInst->isLoad()) {
elemSize = staticInst->getOperandSize(2);
} else {
assert(gpuDynInst->isStore());
elemSize = staticInst->getOperandSize(1);
}
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
if (gpuDynInst->exec_mask[lane]) {
gpuDynInst->addr.at(lane) = flat_scratch_addr
+ swizzle(voffset[lane] + offset, lane, elemSize);
}
}
}
if (isFlat()) {
@@ -1285,6 +1335,7 @@ namespace VegaISA
assert(isFlatScratch());
gpuDynInst->staticInstruction()->executed_as =
enums::SC_PRIVATE;
gpuDynInst->resolveFlatSegment(gpuDynInst->exec_mask);
}
}
@@ -1421,6 +1472,23 @@ namespace VegaISA
}
}
}
VecElemU32
swizzle(VecElemU32 offset, int lane, int elem_size)
{
// This is not described in the spec. We use the swizzle from
// buffer memory instructions and fix the stride to 4. Multiply
// the thread ID by the storage size to avoid threads clobbering
// their data.
return ((offset / 4) * 4 * 64)
+ (offset % 4) + (lane * elem_size);
}
Addr
readFlatScratch(GPUDynInstPtr gpuDynInst)
{
return gpuDynInst->computeUnit()->shader->getScratchBase();
}
}; // Inst_FLAT
} // namespace VegaISA
} // namespace gem5

View File

@@ -86,6 +86,8 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p)
gfx_version = GfxVersion::gfx908;
} else if (p.device_name == "MI200") {
gfx_version = GfxVersion::gfx90a;
} else if (p.device_name == "MI300X") {
gfx_version = GfxVersion::gfx942;
} else {
panic("Unknown GPU device %s\n", p.device_name);
}
@@ -124,7 +126,8 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p)
sdmaFunc.insert({0x10b, &SDMAEngine::setPageDoorbellOffsetLo});
sdmaFunc.insert({0xe0, &SDMAEngine::setPageSize});
sdmaFunc.insert({0x113, &SDMAEngine::setPageWptrLo});
} else if (p.device_name == "MI100" || p.device_name == "MI200") {
} else if (p.device_name == "MI100" || p.device_name == "MI200"
|| p.device_name == "MI300X") {
sdmaFunc.insert({0xd9, &SDMAEngine::setPageBaseLo});
sdmaFunc.insert({0xe1, &SDMAEngine::setPageRptrLo});
sdmaFunc.insert({0xe0, &SDMAEngine::setPageRptrHi});
@@ -195,6 +198,10 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p)
setRegVal(MI200_FB_LOCATION_BASE, mmhubBase >> 24);
setRegVal(MI200_FB_LOCATION_TOP, mmhubTop >> 24);
setRegVal(MI200_MEM_SIZE_REG, mem_size);
} else if (p.device_name == "MI300X") {
setRegVal(MI200_FB_LOCATION_BASE, mmhubBase >> 24);
setRegVal(MI200_FB_LOCATION_TOP, mmhubTop >> 24);
setRegVal(MI200_MEM_SIZE_REG, mem_size);
} else {
panic("Unknown GPU device %s\n", p.device_name);
}

View File

@@ -328,8 +328,8 @@ typedef struct GEM5_PACKED
};
uint64_t completionSignal;
};
} PM4MapProcessMI200;
static_assert(sizeof(PM4MapProcessMI200) == 80);
} PM4MapProcessV2;
static_assert(sizeof(PM4MapProcessV2) == 80);
typedef struct GEM5_PACKED
{

View File

@@ -290,18 +290,19 @@ PM4PacketProcessor::decodeHeader(PM4Queue *q, PM4Header header)
dmaBuffer);
} break;
case IT_MAP_PROCESS: {
if (gpuDevice->getGfxVersion() == GfxVersion::gfx90a) {
dmaBuffer = new PM4MapProcessMI200();
if (gpuDevice->getGfxVersion() == GfxVersion::gfx90a ||
gpuDevice->getGfxVersion() == GfxVersion::gfx942) {
dmaBuffer = new PM4MapProcessV2();
cb = new DmaVirtCallback<uint64_t>(
[ = ] (const uint64_t &)
{ mapProcessGfx90a(q, (PM4MapProcessMI200 *)dmaBuffer); });
dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4MapProcessMI200),
{ mapProcessV2(q, (PM4MapProcessV2 *)dmaBuffer); });
dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4MapProcessV2),
cb, dmaBuffer);
} else {
dmaBuffer = new PM4MapProcess();
cb = new DmaVirtCallback<uint64_t>(
[ = ] (const uint64_t &)
{ mapProcessGfx9(q, (PM4MapProcess *)dmaBuffer); });
{ mapProcessV1(q, (PM4MapProcess *)dmaBuffer); });
dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4MapProcess), cb,
dmaBuffer);
}
@@ -701,7 +702,7 @@ PM4PacketProcessor::mapProcess(uint32_t pasid, uint64_t ptBase,
}
void
PM4PacketProcessor::mapProcessGfx9(PM4Queue *q, PM4MapProcess *pkt)
PM4PacketProcessor::mapProcessV1(PM4Queue *q, PM4MapProcess *pkt)
{
q->incRptr(sizeof(PM4MapProcess));
@@ -716,9 +717,9 @@ PM4PacketProcessor::mapProcessGfx9(PM4Queue *q, PM4MapProcess *pkt)
}
void
PM4PacketProcessor::mapProcessGfx90a(PM4Queue *q, PM4MapProcessMI200 *pkt)
PM4PacketProcessor::mapProcessV2(PM4Queue *q, PM4MapProcessV2 *pkt)
{
q->incRptr(sizeof(PM4MapProcessMI200));
q->incRptr(sizeof(PM4MapProcessV2));
DPRINTF(PM4PacketProcessor, "PM4 map_process pasid: %p quantum: "
"%d pt: %p signal: %p\n", pkt->pasid, pkt->processQuantum,

View File

@@ -146,8 +146,8 @@ class PM4PacketProcessor : public DmaVirtDevice
void unmapQueues(PM4Queue *q, PM4UnmapQueues *pkt);
void doneMQDWrite(Addr mqdAddr, Addr addr);
void mapProcess(uint32_t pasid, uint64_t ptBase, uint32_t shMemBases);
void mapProcessGfx9(PM4Queue *q, PM4MapProcess *pkt);
void mapProcessGfx90a(PM4Queue *q, PM4MapProcessMI200 *pkt);
void mapProcessV1(PM4Queue *q, PM4MapProcess *pkt);
void mapProcessV2(PM4Queue *q, PM4MapProcessV2 *pkt);
void processMQD(PM4MapQueues *pkt, PM4Queue *q, Addr addr, QueueDesc *mqd,
uint16_t vmid);
void processSDMAMQD(PM4MapQueues *pkt, PM4Queue *q, Addr addr,

View File

@@ -45,7 +45,7 @@ class PrefetchType(Enum):
class GfxVersion(ScopedEnum):
vals = ["gfx900", "gfx902", "gfx908", "gfx90a"]
vals = ["gfx900", "gfx902", "gfx908", "gfx90a", "gfx942"]
class PoolManager(SimObject):

View File

@@ -910,35 +910,63 @@ GPUDynInst::resolveFlatSegment(const VectorMask &mask)
* #flat-addressing
*/
uint32_t numSgprs = wavefront()->maxSgprs;
uint32_t physSgprIdx =
wavefront()->computeUnit->registerManager->mapSgpr(wavefront(),
numSgprs - 4);
uint32_t offset =
wavefront()->computeUnit->srf[simdId]->read(physSgprIdx);
physSgprIdx =
wavefront()->computeUnit->registerManager->mapSgpr(wavefront(),
numSgprs - 3);
uint32_t size =
wavefront()->computeUnit->srf[simdId]->read(physSgprIdx);
for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
addr[lane] = addr[lane] + lane * size + offset +
wavefront()->computeUnit->shader->getHiddenPrivateBase() -
wavefront()->computeUnit->shader->getScratchBase();
ComputeUnit *cu = wavefront()->computeUnit;
if (wavefront()->gfxVersion == GfxVersion::gfx942) {
// Architected flat scratch base address in FLAT_SCRATCH registers
uint32_t fs_lo = cu->srf[simdId]->read(
VegaISA::REG_FLAT_SCRATCH_LO);
uint32_t fs_hi = cu->srf[simdId]->read(
VegaISA::REG_FLAT_SCRATCH_HI);
Addr arch_flat_scratch = ((Addr)(fs_hi) << 32) | fs_lo;
for (int lane = 0; lane < cu->wfSize(); ++lane) {
if (mask[lane]) {
// The scratch base is added for other gfx versions,
// otherwise this would simply add the register base.
addr[lane] = addr[lane] - cu->shader->getScratchBase()
+ arch_flat_scratch;
}
}
} else {
// In absolute flat scratch the program needs to place scratch
// address in SGPRn-3,4.
uint32_t numSgprs = wavefront()->maxSgprs;
uint32_t physSgprIdx =
cu->registerManager->mapSgpr(wavefront(), numSgprs - 4);
uint32_t offset = cu->srf[simdId]->read(physSgprIdx);
physSgprIdx =
cu->registerManager->mapSgpr(wavefront(), numSgprs - 3);
uint32_t size = cu->srf[simdId]->read(physSgprIdx);
for (int lane = 0; lane < cu->wfSize(); ++lane) {
if (mask[lane]) {
addr[lane] = addr[lane] + lane * size + offset +
cu->shader->getHiddenPrivateBase() -
cu->shader->getScratchBase();
}
}
}
wavefront()->execUnitId = wavefront()->flatLmUnitId;
wavefront()->decLGKMInstsIssued();
if (isLoad()) {
wavefront()->rdLmReqsInPipe--;
} else if (isStore()) {
wavefront()->wrLmReqsInPipe--;
} else if (isAtomic() || isMemSync()) {
wavefront()->wrLmReqsInPipe--;
wavefront()->rdLmReqsInPipe--;
} else {
panic("Invalid memory operation!\n");
wavefront()->execUnitId = wavefront()->flatLmUnitId;
// For FLAT the local memory pipe counters are incremented, but they
// are not incremented for explicit scratch_* instructions. Only
// decrement these counters if we are explicitly a FLAT instruction.
if (isFlat()) {
wavefront()->decLGKMInstsIssued();
if (isLoad()) {
wavefront()->rdLmReqsInPipe--;
} else if (isStore()) {
wavefront()->wrLmReqsInPipe--;
} else if (isAtomic() || isMemSync()) {
wavefront()->wrLmReqsInPipe--;
wavefront()->rdLmReqsInPipe--;
} else {
panic("Invalid memory operation!\n");
}
}
} else {
for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) {

View File

@@ -179,7 +179,8 @@ class GPUStaticInst : public GPUStaticInstFlags
{
return _flags[MemoryRef] && (_flags[GlobalSegment] ||
_flags[PrivateSegment] || _flags[ReadOnlySegment] ||
_flags[SpillSegment] || _flags[FlatGlobal]);
_flags[SpillSegment] || _flags[FlatGlobal] ||
_flags[FlatScratch]);
}
bool

View File

@@ -94,9 +94,10 @@ class HSAQueueEntry
// LLVM docs: https://www.llvm.org/docs/AMDGPUUsage.html
// #code-object-v3-kernel-descriptor
//
// Currently, the only supported gfx version in gem5 that computes
// VGPR count differently is gfx90a.
if (gfx_version == GfxVersion::gfx90a) {
// Currently, the only supported gfx versions in gem5 that compute
// VGPR count differently are gfx90a and gfx942.
if (gfx_version == GfxVersion::gfx90a ||
gfx_version == GfxVersion::gfx942) {
numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 8;
} else {
numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 4;
@@ -107,7 +108,8 @@ class HSAQueueEntry
if (gfx_version == GfxVersion::gfx900 ||
gfx_version == GfxVersion::gfx902 ||
gfx_version == GfxVersion::gfx908 ||
gfx_version == GfxVersion::gfx90a) {
gfx_version == GfxVersion::gfx90a ||
gfx_version == GfxVersion::gfx942) {
numSgprs = ((akc->granulated_wavefront_sgpr_count + 1) * 16)/2;
} else {
panic("Saw unknown gfx version setting up GPR counts\n");

View File

@@ -118,6 +118,7 @@ void
Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
{
int regInitIdx = 0;
gfxVersion = task->gfxVersion();
// Iterate over all the init fields and check which
// bits are enabled. Useful information can be found here:
@@ -378,8 +379,29 @@ Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
wfSlotId, wfDynId, physSgprIdx, workGroupId[2]);
break;
case PrivSegWaveByteOffset:
// For architected flat scratch, this enable is reused to set
// the FLAT_SCRATCH register pair to the scratch backing
// memory: https://llvm.org/docs/AMDGPUUsage.html#flat-scratch
if (task->gfxVersion() == GfxVersion::gfx942) {
Addr arch_flat_scratch =
task->amdQueue.scratch_backing_memory_location;
computeUnit->srf[simdId]->write(
VegaISA::REG_FLAT_SCRATCH_HI,
bits(arch_flat_scratch, 63, 32));
computeUnit->srf[simdId]->write(
VegaISA::REG_FLAT_SCRATCH_LO,
bits(arch_flat_scratch, 31, 0));
break;
}
// Not architected flat scratch. Write the scratch wavefront
// offset: https://llvm.org/docs/AMDGPUUsage.html
// #amdgpu-amdhsa-initial-kernel-execution-state
physSgprIdx =
computeUnit->registerManager->mapSgpr(this, regInitIdx);
/**
* the compute_tmpring_size_wavesize specifies the number of
* kB allocated per wavefront, hence the multiplication by
@@ -442,7 +464,8 @@ Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
// Default to false and set to true for gem5 supported ISAs.
bool packed_work_item_id = false;
if (task->gfxVersion() == GfxVersion::gfx90a) {
if (task->gfxVersion() == GfxVersion::gfx90a ||
task->gfxVersion() == GfxVersion::gfx942) {
packed_work_item_id = true;
}

View File

@@ -92,6 +92,8 @@ class Wavefront : public SimObject
S_BARRIER
};
// gfx version wavefront is executing
GfxVersion gfxVersion;
// HW slot id where the WF is mapped to inside a SIMD unit
const int wfSlotId;
int kernId;