From c1803eafaccf33130b87f1bb6231103f7ac1bf63 Mon Sep 17 00:00:00 2001
From: Matthew Poremba <matthew.poremba@amd.com>
Date: Wed, 15 May 2024 12:00:47 -0700
Subject: [PATCH] arch-vega: Architected flat scratch and scratch insts

Architected flat scratch is added in MI300 which store the scratch base
address in dedicated registers rather than in SGPRs. These registers are
used by scratch_ instructions. These are flat instruction which
explicitly target the private memory aperture. These instructions have a
different address calculation than global_ instructions.

This change implements architected flat scratch support, fixes the
address calculation of scratch_ instructions, and implements decodings
for some scratch_ instructions. Previous flat_ instructions which happen
to access the private memory aperture have no change in address
calculation. Since scratch_ instructions are identical to flat_
instruction except for address calculation, the decodings simply reuse
existing flat_ instruction definitions.

Change-Id: I1e1d15a2fbcc7a4a678157c35608f4f22b359e21
---
 src/arch/amdgpu/vega/gpu_decoder.cc        | 24 +++----
 src/arch/amdgpu/vega/insts/op_encodings.hh | 74 ++++++++++++++++++-
 src/gpu-compute/gpu_dyn_inst.cc            | 82 +++++++++++++++-------
 src/gpu-compute/gpu_static_inst.hh         |  3 +-
 src/gpu-compute/wavefront.cc               | 22 ++++++
 src/gpu-compute/wavefront.hh               |  2 +
 6 files changed, 160 insertions(+), 47 deletions(-)

diff --git a/src/arch/amdgpu/vega/gpu_decoder.cc b/src/arch/amdgpu/vega/gpu_decoder.cc
index 6f34301f48..eb5a5bb309 100644
--- a/src/arch/amdgpu/vega/gpu_decoder.cc
+++ b/src/arch/amdgpu/vega/gpu_decoder.cc
@@ -9922,29 +9922,25 @@ namespace VegaISA
     GPUStaticInst*
     Decoder::decode_OP_SCRATCH__SCRATCH_LOAD_DWORD(MachInst iFmt)
     {
-        fatal("Trying to decode instruction without a class\n");
-        return nullptr;
+        return new Inst_FLAT__FLAT_LOAD_DWORD(&iFmt->iFmt_FLAT);
     }
 
     GPUStaticInst*
     Decoder::decode_OP_SCRATCH__SCRATCH_LOAD_DWORDX2(MachInst iFmt)
     {
-        fatal("Trying to decode instruction without a class\n");
-        return nullptr;
+        return new Inst_FLAT__FLAT_LOAD_DWORDX2(&iFmt->iFmt_FLAT);
     }
 
     GPUStaticInst*
     Decoder::decode_OP_SCRATCH__SCRATCH_LOAD_DWORDX3(MachInst iFmt)
     {
-        fatal("Trying to decode instruction without a class\n");
-        return nullptr;
+        return new Inst_FLAT__FLAT_LOAD_DWORDX3(&iFmt->iFmt_FLAT);
     }
 
     GPUStaticInst*
     Decoder::decode_OP_SCRATCH__SCRATCH_LOAD_DWORDX4(MachInst iFmt)
     {
-        fatal("Trying to decode instruction without a class\n");
-        return nullptr;
+        return new Inst_FLAT__FLAT_LOAD_DWORDX4(&iFmt->iFmt_FLAT);
     }
 
     GPUStaticInst*
@@ -9977,29 +9973,25 @@ namespace VegaISA
     GPUStaticInst*
     Decoder::decode_OP_SCRATCH__SCRATCH_STORE_DWORD(MachInst iFmt)
     {
-        fatal("Trying to decode instruction without a class\n");
-        return nullptr;
+        return new Inst_FLAT__FLAT_STORE_DWORD(&iFmt->iFmt_FLAT);
     }
 
     GPUStaticInst*
     Decoder::decode_OP_SCRATCH__SCRATCH_STORE_DWORDX2(MachInst iFmt)
     {
-        fatal("Trying to decode instruction without a class\n");
-        return nullptr;
+        return new Inst_FLAT__FLAT_STORE_DWORDX2(&iFmt->iFmt_FLAT);
     }
 
     GPUStaticInst*
     Decoder::decode_OP_SCRATCH__SCRATCH_STORE_DWORDX3(MachInst iFmt)
     {
-        fatal("Trying to decode instruction without a class\n");
-        return nullptr;
+        return new Inst_FLAT__FLAT_STORE_DWORDX3(&iFmt->iFmt_FLAT);
     }
 
     GPUStaticInst*
     Decoder::decode_OP_SCRATCH__SCRATCH_STORE_DWORDX4(MachInst iFmt)
     {
-        fatal("Trying to decode instruction without a class\n");
-        return nullptr;
+        return new Inst_FLAT__FLAT_STORE_DWORDX4(&iFmt->iFmt_FLAT);
     }
 
     GPUStaticInst*
diff --git a/src/arch/amdgpu/vega/insts/op_encodings.hh b/src/arch/amdgpu/vega/insts/op_encodings.hh
index 9ab7b84974..5861f296ff 100644
--- a/src/arch/amdgpu/vega/insts/op_encodings.hh
+++ b/src/arch/amdgpu/vega/insts/op_encodings.hh
@@ -1258,13 +1258,12 @@ namespace VegaISA
             // If saddr = 0x7f there is no scalar reg to read and address will
             // be a 64-bit address. Otherwise, saddr is the reg index for a
             // scalar reg used as the base address for a 32-bit address.
-            if ((saddr == 0x7f && (isFlatGlobal() || isFlatScratch()))
-                || isFlat()) {
+            if ((saddr == 0x7f && isFlatGlobal()) || isFlat()) {
                 ConstVecOperandU64 vbase(gpuDynInst, vaddr);
                 vbase.read();
 
                 calcAddrVgpr(gpuDynInst, vbase, offset);
-            } else {
+            } else if (isFlatGlobal()) {
                 // Assume we are operating in 64-bit mode and read a pair of
                 // SGPRs for the address base.
                 ConstScalarOperandU64 sbase(gpuDynInst, saddr);
@@ -1274,6 +1273,57 @@ namespace VegaISA
                 voffset.read();
 
                 calcAddrSgpr(gpuDynInst, voffset, sbase, offset);
+            // For scratch, saddr = 0x7f there is no scalar reg to read and
+            // a vgpr will be used for address offset. Otherwise, saddr is
+            // the sgpr index holding the address offset. For scratch
+            // instructions the offset GPR is always 32-bits.
+            } else if (saddr != 0x7f) {
+                assert(isFlatScratch());
+
+                ConstScalarOperandU32 soffset(gpuDynInst, saddr);
+                soffset.read();
+
+                Addr flat_scratch_addr = readFlatScratch(gpuDynInst);
+
+                int elemSize;
+                auto staticInst = gpuDynInst->staticInstruction();
+                if (gpuDynInst->isLoad()) {
+                    elemSize = staticInst->getOperandSize(2);
+                } else {
+                    assert(gpuDynInst->isStore());
+                    elemSize = staticInst->getOperandSize(1);
+                }
+
+                unsigned swizzleOffset = soffset.rawData() + offset;
+                for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                    if (gpuDynInst->exec_mask[lane]) {
+                        gpuDynInst->addr.at(lane) = flat_scratch_addr
+                            + swizzle(swizzleOffset, lane, elemSize);
+                    }
+                }
+            } else {
+                assert(isFlatScratch());
+
+                ConstVecOperandU32 voffset(gpuDynInst, vaddr);
+                voffset.read();
+
+                Addr flat_scratch_addr = readFlatScratch(gpuDynInst);
+
+                int elemSize;
+                auto staticInst = gpuDynInst->staticInstruction();
+                if (gpuDynInst->isLoad()) {
+                    elemSize = staticInst->getOperandSize(2);
+                } else {
+                    assert(gpuDynInst->isStore());
+                    elemSize = staticInst->getOperandSize(1);
+                }
+
+                for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                    if (gpuDynInst->exec_mask[lane]) {
+                        gpuDynInst->addr.at(lane) = flat_scratch_addr
+                            + swizzle(voffset[lane] + offset, lane, elemSize);
+                    }
+                }
             }
 
             if (isFlat()) {
@@ -1285,6 +1335,7 @@ namespace VegaISA
                 assert(isFlatScratch());
                 gpuDynInst->staticInstruction()->executed_as =
                     enums::SC_PRIVATE;
+                gpuDynInst->resolveFlatSegment(gpuDynInst->exec_mask);
             }
         }
 
@@ -1421,6 +1472,23 @@ namespace VegaISA
                 }
             }
         }
+
+        VecElemU32
+        swizzle(VecElemU32 offset, int lane, int elem_size)
+        {
+            // This is not described in the spec. We use the swizzle from
+            // buffer memory instructions and fix the stride to 4. Multiply
+            // the thread ID by the storage size to avoid threads clobbering
+            // their data.
+            return ((offset / 4) * 4 * 64)
+                + (offset % 4) + (lane * elem_size);
+        }
+
+        Addr
+        readFlatScratch(GPUDynInstPtr gpuDynInst)
+        {
+            return gpuDynInst->computeUnit()->shader->getScratchBase();
+        }
     }; // Inst_FLAT
 } // namespace VegaISA
 } // namespace gem5
diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc
index 66b2b8ec49..80f18d2fa2 100644
--- a/src/gpu-compute/gpu_dyn_inst.cc
+++ b/src/gpu-compute/gpu_dyn_inst.cc
@@ -910,35 +910,63 @@ GPUDynInst::resolveFlatSegment(const VectorMask &mask)
          *     #flat-addressing
          */
 
-        uint32_t numSgprs = wavefront()->maxSgprs;
-        uint32_t physSgprIdx =
-            wavefront()->computeUnit->registerManager->mapSgpr(wavefront(),
-                                                          numSgprs - 4);
-        uint32_t offset =
-            wavefront()->computeUnit->srf[simdId]->read(physSgprIdx);
-        physSgprIdx =
-            wavefront()->computeUnit->registerManager->mapSgpr(wavefront(),
-                                                          numSgprs - 3);
-        uint32_t size =
-            wavefront()->computeUnit->srf[simdId]->read(physSgprIdx);
-        for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) {
-            if (mask[lane]) {
-                addr[lane] = addr[lane] + lane * size + offset +
-                    wavefront()->computeUnit->shader->getHiddenPrivateBase() -
-                    wavefront()->computeUnit->shader->getScratchBase();
+        ComputeUnit *cu = wavefront()->computeUnit;
+
+        if (wavefront()->gfxVersion == GfxVersion::gfx942) {
+            // Architected flat scratch base address in FLAT_SCRATCH registers
+            uint32_t fs_lo = cu->srf[simdId]->read(
+                VegaISA::REG_FLAT_SCRATCH_LO);
+            uint32_t fs_hi = cu->srf[simdId]->read(
+                VegaISA::REG_FLAT_SCRATCH_HI);
+
+            Addr arch_flat_scratch = ((Addr)(fs_hi) << 32) | fs_lo;
+
+            for (int lane = 0; lane < cu->wfSize(); ++lane) {
+                if (mask[lane]) {
+                    // The scratch base is added for other gfx versions,
+                    // otherwise this would simply add the register base.
+                    addr[lane] = addr[lane] - cu->shader->getScratchBase()
+                        + arch_flat_scratch;
+                }
+            }
+        } else {
+            // In absolute flat scratch the program needs to place scratch
+            // address in SGPRn-3,4.
+            uint32_t numSgprs = wavefront()->maxSgprs;
+            uint32_t physSgprIdx =
+                cu->registerManager->mapSgpr(wavefront(), numSgprs - 4);
+            uint32_t offset = cu->srf[simdId]->read(physSgprIdx);
+            physSgprIdx =
+                cu->registerManager->mapSgpr(wavefront(), numSgprs - 3);
+            uint32_t size = cu->srf[simdId]->read(physSgprIdx);
+
+
+            for (int lane = 0; lane < cu->wfSize(); ++lane) {
+                if (mask[lane]) {
+                    addr[lane] = addr[lane] + lane * size + offset +
+                        cu->shader->getHiddenPrivateBase() -
+                        cu->shader->getScratchBase();
+                }
             }
         }
-        wavefront()->execUnitId =  wavefront()->flatLmUnitId;
-        wavefront()->decLGKMInstsIssued();
-        if (isLoad()) {
-            wavefront()->rdLmReqsInPipe--;
-        } else if (isStore()) {
-            wavefront()->wrLmReqsInPipe--;
-        } else if (isAtomic() || isMemSync()) {
-            wavefront()->wrLmReqsInPipe--;
-            wavefront()->rdLmReqsInPipe--;
-        } else {
-            panic("Invalid memory operation!\n");
+
+        wavefront()->execUnitId = wavefront()->flatLmUnitId;
+
+        // For FLAT the local memory pipe counters are incremented, but they
+        // are not incremented for explicit scratch_* instructions. Only
+        // decrement these counters if we are explicitly a FLAT instruction.
+        if (isFlat()) {
+            wavefront()->decLGKMInstsIssued();
+            if (isLoad()) {
+                wavefront()->rdLmReqsInPipe--;
+            } else if (isStore()) {
+                wavefront()->wrLmReqsInPipe--;
+            } else if (isAtomic() || isMemSync()) {
+                wavefront()->wrLmReqsInPipe--;
+                wavefront()->rdLmReqsInPipe--;
+            } else {
+                panic("Invalid memory operation!\n");
+            }
         }
     } else {
         for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) {
diff --git a/src/gpu-compute/gpu_static_inst.hh b/src/gpu-compute/gpu_static_inst.hh
index 6132ab2d29..1ec06dc7d3 100644
--- a/src/gpu-compute/gpu_static_inst.hh
+++ b/src/gpu-compute/gpu_static_inst.hh
@@ -179,7 +179,8 @@ class GPUStaticInst : public GPUStaticInstFlags
     {
         return _flags[MemoryRef] && (_flags[GlobalSegment] ||
                _flags[PrivateSegment] || _flags[ReadOnlySegment] ||
-               _flags[SpillSegment] || _flags[FlatGlobal]);
+               _flags[SpillSegment] || _flags[FlatGlobal] ||
+               _flags[FlatScratch]);
     }
 
     bool
diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc
index b5298bad4c..de7c2333c2 100644
--- a/src/gpu-compute/wavefront.cc
+++ b/src/gpu-compute/wavefront.cc
@@ -118,6 +118,7 @@ void
 Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
 {
     int regInitIdx = 0;
+    gfxVersion = task->gfxVersion();
 
     // Iterate over all the init fields and check which
     // bits are enabled. Useful information can be found here:
@@ -378,8 +379,29 @@ Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
                         wfSlotId, wfDynId, physSgprIdx, workGroupId[2]);
                 break;
               case PrivSegWaveByteOffset:
+
+                // For architected flat scratch, this enable is reused to set
+                // the FLAT_SCRATCH register pair to the scratch backing
+                // memory: https://llvm.org/docs/AMDGPUUsage.html#flat-scratch
+                if (task->gfxVersion() == GfxVersion::gfx942) {
+                    Addr arch_flat_scratch =
+                        task->amdQueue.scratch_backing_memory_location;
+                    computeUnit->srf[simdId]->write(
+                        VegaISA::REG_FLAT_SCRATCH_HI,
+                        bits(arch_flat_scratch, 63, 32));
+                    computeUnit->srf[simdId]->write(
+                        VegaISA::REG_FLAT_SCRATCH_LO,
+                        bits(arch_flat_scratch, 31, 0));
+
+                    break;
+                }
+
+                // Not architected flat scratch. Write the scratch wavefront
+                // offset: https://llvm.org/docs/AMDGPUUsage.html
+                //              #amdgpu-amdhsa-initial-kernel-execution-state
                 physSgprIdx =
                     computeUnit->registerManager->mapSgpr(this, regInitIdx);
+
                 /**
                   * the compute_tmpring_size_wavesize specifies the number of
                   * kB allocated per wavefront, hence the multiplication by
diff --git a/src/gpu-compute/wavefront.hh b/src/gpu-compute/wavefront.hh
index 82035f7d47..b7dff4617b 100644
--- a/src/gpu-compute/wavefront.hh
+++ b/src/gpu-compute/wavefront.hh
@@ -92,6 +92,8 @@ class Wavefront : public SimObject
         S_BARRIER
     };
 
+    // gfx version wavefront is executing
+    GfxVersion gfxVersion;
     // HW slot id where the WF is mapped to inside a SIMD unit
     const int wfSlotId;
     int kernId;