From e0379f45262fc582793ed29e88bd6d69a8241196 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Sat, 26 Aug 2023 13:06:54 -0500 Subject: [PATCH 1/3] gpu-compute: Fix flat scratch resource counters Flat instructions may access memory locations in LDS (scratchpad) and global (VRAM/framebuffer) and therefore increment both counters when dispatched. Once the aperture is known, we decrement the counters of the aperture that was *not* used. This is done incorrectly for scratch / private flat instructions. Private memory is backed by global memory, and therefore the local memory counters are the ones that should be decremented. This commit fixes the counters by changing the global decrements to local decrements. Change-Id: I25890446908df72e5469e9dbaba6c984955196cf --- src/gpu-compute/gpu_dyn_inst.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc index 3cbb6f1ff8..991ba2c07a 100644 --- a/src/gpu-compute/gpu_dyn_inst.cc +++ b/src/gpu-compute/gpu_dyn_inst.cc @@ -919,12 +919,12 @@ GPUDynInst::resolveFlatSegment(const VectorMask &mask) wavefront()->execUnitId = wavefront()->flatLmUnitId; wavefront()->decLGKMInstsIssued(); if (isLoad()) { - wavefront()->rdGmReqsInPipe--; + wavefront()->rdLmReqsInPipe--; } else if (isStore()) { - wavefront()->wrGmReqsInPipe--; + wavefront()->wrLmReqsInPipe--; } else if (isAtomic() || isMemSync()) { - wavefront()->rdGmReqsInPipe--; - wavefront()->wrGmReqsInPipe--; + wavefront()->wrLmReqsInPipe--; + wavefront()->rdLmReqsInPipe--; } else { panic("Invalid memory operation!\n"); } From 4506188e003c7f67e1323f1330da239f22441da7 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Sat, 26 Aug 2023 13:09:28 -0500 Subject: [PATCH 2/3] gpu-compute: Fix private offset/size register indexes According to the ABI documentation from LLVM, the *low* register of flat scratch (maxSGPR - 4) is the offset and the high register (maxSGPR - 3) is the size. 
These are currently backwards, resulting in some gnarly addresses being generated, leading to page faults and/or incorrect data. This commit fixes the issue by setting the order correctly. Change-Id: I0b1d077c49c0ee2a4e59b0f6d85cdb8f17f9be61 --- src/gpu-compute/gpu_dyn_inst.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc index 991ba2c07a..8dfae1dc90 100644 --- a/src/gpu-compute/gpu_dyn_inst.cc +++ b/src/gpu-compute/gpu_dyn_inst.cc @@ -901,12 +901,12 @@ GPUDynInst::resolveFlatSegment(const VectorMask &mask) uint32_t numSgprs = wavefront()->maxSgprs; uint32_t physSgprIdx = wavefront()->computeUnit->registerManager->mapSgpr(wavefront(), - numSgprs - 3); + numSgprs - 4); uint32_t offset = wavefront()->computeUnit->srf[simdId]->read(physSgprIdx); physSgprIdx = wavefront()->computeUnit->registerManager->mapSgpr(wavefront(), - numSgprs - 4); + numSgprs - 3); uint32_t size = wavefront()->computeUnit->srf[simdId]->read(physSgprIdx); for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) { From 60f071d09a03ced31c2f1c7496222b3007a33bae Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Sat, 26 Aug 2023 13:14:13 -0500 Subject: [PATCH 3/3] gpu-compute,arch-vega: Implement flat scratch insts Flat scratch instructions (aka private) are the 3rd and final segment of flat instructions in gfx9 (Vega) and beyond. These are used for things like spills/fills and thread local storage. This commit enables two forms of flat scratch instructions: (1) flat_load/flat_store instructions where the memory address resolves to private memory and (2) the new scratch_load/scratch_store instructions in Vega. The first are similar to older generation ISAs where the aperture is unknown until address translation. The second are instructions guaranteed to go to private memory. 
Since these are very similar to flat global instructions there are minimal changes needed: - Ensure a flat instruction is either regular flat, global, XOR scratch - Rename the global op_encoding methods to GlobalScratch to indicate they are for both and are intentionally used. - Flat instructions in segment 1 output scratch_ in the disassembly - Flat instruction executed as private use similar mem helpers as global - Flat scratch cannot be an atomic This was tested using a modified version of the 'square' application: template __global__ void scratch_square(T *C_d, T *A_d, size_t N) { size_t offset = (blockIdx.x * blockDim.x + threadIdx.x); size_t stride = blockDim.x * gridDim.x ; volatile int foo; // Volatile ensures scratch / unoptimized code for (size_t i=offset; iSEG == 0) { setFlag(Flat); + } else if (iFmt->SEG == 1) { + setFlag(FlatScratch); } else if (iFmt->SEG == 2) { setFlag(FlatGlobal); } else { @@ -1573,12 +1575,12 @@ namespace VegaISA Inst_FLAT::initOperandInfo() { // One of the flat subtypes should be specified via flags - assert(isFlat() ^ isFlatGlobal()); + assert(isFlat() ^ isFlatGlobal() ^ isFlatScratch()); if (isFlat()) { initFlatOperandInfo(); - } else if (isFlatGlobal()) { - initGlobalOperandInfo(); + } else if (isFlatGlobal() || isFlatScratch()) { + initGlobalScratchOperandInfo(); } else { panic("Unknown flat subtype!\n"); } @@ -1622,7 +1624,7 @@ namespace VegaISA } void - Inst_FLAT::initGlobalOperandInfo() + Inst_FLAT::initGlobalScratchOperandInfo() { //3 formats: // 1 dst + 2 src (load) @@ -1691,12 +1693,12 @@ namespace VegaISA Inst_FLAT::generateDisassembly() { // One of the flat subtypes should be specified via flags - assert(isFlat() ^ isFlatGlobal()); + assert(isFlat() ^ isFlatGlobal() ^ isFlatScratch()); if (isFlat()) { generateFlatDisassembly(); - } else if (isFlatGlobal()) { - generateGlobalDisassembly(); + } else if (isFlatGlobal() || isFlatScratch()) { + generateGlobalScratchDisassembly(); } else { panic("Unknown flat subtype!\n"); } 
@@ -1720,11 +1722,16 @@ namespace VegaISA } void - Inst_FLAT::generateGlobalDisassembly() + Inst_FLAT::generateGlobalScratchDisassembly() { // Replace flat_ with global_ in assembly string std::string global_opcode = _opcode; - global_opcode.replace(0, 4, "global"); + if (isFlatGlobal()) { + global_opcode.replace(0, 4, "global"); + } else { + assert(isFlatScratch()); + global_opcode.replace(0, 4, "scratch"); + } std::stringstream dis_stream; dis_stream << global_opcode << " "; diff --git a/src/arch/amdgpu/vega/insts/op_encodings.hh b/src/arch/amdgpu/vega/insts/op_encodings.hh index 0f5f502add..613d78b25e 100644 --- a/src/arch/amdgpu/vega/insts/op_encodings.hh +++ b/src/arch/amdgpu/vega/insts/op_encodings.hh @@ -926,7 +926,8 @@ namespace VegaISA void initMemRead(GPUDynInstPtr gpuDynInst) { - if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { + if (gpuDynInst->executedAs() == enums::SC_GLOBAL || + gpuDynInst->executedAs() == enums::SC_PRIVATE) { initMemReqHelper(gpuDynInst, MemCmd::ReadReq); } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { Wavefront *wf = gpuDynInst->wavefront(); @@ -944,7 +945,8 @@ namespace VegaISA void initMemRead(GPUDynInstPtr gpuDynInst) { - if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { + if (gpuDynInst->executedAs() == enums::SC_GLOBAL || + gpuDynInst->executedAs() == enums::SC_PRIVATE) { initMemReqHelper(gpuDynInst, MemCmd::ReadReq); } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { Wavefront *wf = gpuDynInst->wavefront(); @@ -966,7 +968,8 @@ namespace VegaISA void initMemWrite(GPUDynInstPtr gpuDynInst) { - if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { + if (gpuDynInst->executedAs() == enums::SC_GLOBAL || + gpuDynInst->executedAs() == enums::SC_PRIVATE) { initMemReqHelper(gpuDynInst, MemCmd::WriteReq); } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { Wavefront *wf = gpuDynInst->wavefront(); @@ -984,7 +987,8 @@ namespace VegaISA void initMemWrite(GPUDynInstPtr gpuDynInst) { - if (gpuDynInst->executedAs() 
== enums::SC_GLOBAL) { + if (gpuDynInst->executedAs() == enums::SC_GLOBAL || + gpuDynInst->executedAs() == enums::SC_PRIVATE) { initMemReqHelper(gpuDynInst, MemCmd::WriteReq); } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { Wavefront *wf = gpuDynInst->wavefront(); @@ -1006,6 +1010,10 @@ namespace VegaISA void initAtomicAccess(GPUDynInstPtr gpuDynInst) { + // Flat scratch requests may not be atomic according to ISA manual + // up to MI200. See MI200 manual Table 45. + assert(gpuDynInst->executedAs() != enums::SC_PRIVATE); + if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { initMemReqHelper(gpuDynInst, MemCmd::SwapReq, true); } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { @@ -1044,7 +1052,8 @@ namespace VegaISA // If saddr = 0x7f there is no scalar reg to read and address will // be a 64-bit address. Otherwise, saddr is the reg index for a // scalar reg used as the base address for a 32-bit address. - if ((saddr == 0x7f && isFlatGlobal()) || isFlat()) { + if ((saddr == 0x7f && (isFlatGlobal() || isFlatScratch())) + || isFlat()) { ConstVecOperandU64 vbase(gpuDynInst, vaddr); vbase.read(); @@ -1063,9 +1072,13 @@ namespace VegaISA if (isFlat()) { gpuDynInst->resolveFlatSegment(gpuDynInst->exec_mask); - } else { + } else if (isFlatGlobal()) { gpuDynInst->staticInstruction()->executed_as = enums::SC_GLOBAL; + } else { + assert(isFlatScratch()); + gpuDynInst->staticInstruction()->executed_as = + enums::SC_PRIVATE; } } @@ -1081,7 +1094,9 @@ namespace VegaISA gpuDynInst->computeUnit()->localMemoryPipe .issueRequest(gpuDynInst); } else { - fatal("Unsupported scope for flat instruction.\n"); + assert(gpuDynInst->executedAs() == enums::SC_PRIVATE); + gpuDynInst->computeUnit()->globalMemoryPipe + .issueRequest(gpuDynInst); } } @@ -1098,10 +1113,10 @@ namespace VegaISA private: void initFlatOperandInfo(); - void initGlobalOperandInfo(); + void initGlobalScratchOperandInfo(); void generateFlatDisassembly(); - void generateGlobalDisassembly(); + void 
generateGlobalScratchDisassembly(); void calcAddrSgpr(GPUDynInstPtr gpuDynInst, ConstVecOperandU32 &vaddr, diff --git a/src/gpu-compute/GPUStaticInstFlags.py b/src/gpu-compute/GPUStaticInstFlags.py index b75e2c6c92..3a44d402be 100644 --- a/src/gpu-compute/GPUStaticInstFlags.py +++ b/src/gpu-compute/GPUStaticInstFlags.py @@ -54,6 +54,7 @@ class GPUStaticInstFlags(Enum): "MemoryRef", # References memory (load, store, or atomic) "Flat", # Flat memory op "FlatGlobal", # Global memory op + "FlatScratch", # Scratch memory op "Load", # Reads from memory "Store", # Writes to memory # Atomic ops diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc index ea903455d5..8d6deeb85a 100644 --- a/src/gpu-compute/compute_unit.cc +++ b/src/gpu-compute/compute_unit.cc @@ -1917,6 +1917,8 @@ ComputeUnit::updateInstStats(GPUDynInstPtr gpuDynInst) } } else if (gpuDynInst->isFlatGlobal()) { stats.flatVMemInsts++; + } else if (gpuDynInst->isFlatScratch()) { + stats.flatVMemInsts++; } else if (gpuDynInst->isLocalMem()) { stats.ldsNoFlatInsts++; } else if (gpuDynInst->isLoad()) { diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc index 8dfae1dc90..0b394e7e36 100644 --- a/src/gpu-compute/gpu_dyn_inst.cc +++ b/src/gpu-compute/gpu_dyn_inst.cc @@ -432,6 +432,12 @@ GPUDynInst::isFlatGlobal() const return _staticInst->isFlatGlobal(); } +bool +GPUDynInst::isFlatScratch() const +{ + return _staticInst->isFlatScratch(); +} + bool GPUDynInst::isLoad() const { diff --git a/src/gpu-compute/gpu_dyn_inst.hh b/src/gpu-compute/gpu_dyn_inst.hh index e2884a012a..558cce8431 100644 --- a/src/gpu-compute/gpu_dyn_inst.hh +++ b/src/gpu-compute/gpu_dyn_inst.hh @@ -234,6 +234,7 @@ class GPUDynInst : public GPUExecContext bool isMemRef() const; bool isFlat() const; bool isFlatGlobal() const; + bool isFlatScratch() const; bool isLoad() const; bool isStore() const; diff --git a/src/gpu-compute/gpu_static_inst.hh b/src/gpu-compute/gpu_static_inst.hh index 
b86a507dce..156f0e529d 100644 --- a/src/gpu-compute/gpu_static_inst.hh +++ b/src/gpu-compute/gpu_static_inst.hh @@ -130,6 +130,7 @@ class GPUStaticInst : public GPUStaticInstFlags bool isMemRef() const { return _flags[MemoryRef]; } bool isFlat() const { return _flags[Flat]; } bool isFlatGlobal() const { return _flags[FlatGlobal]; } + bool isFlatScratch() const { return _flags[FlatScratch]; } bool isLoad() const { return _flags[Load]; } bool isStore() const { return _flags[Store]; } diff --git a/src/gpu-compute/scoreboard_check_stage.cc b/src/gpu-compute/scoreboard_check_stage.cc index 3d18260822..b618cab278 100644 --- a/src/gpu-compute/scoreboard_check_stage.cc +++ b/src/gpu-compute/scoreboard_check_stage.cc @@ -154,7 +154,8 @@ ScoreboardCheckStage::ready(Wavefront *w, nonrdytype_e *rdyStatus, if (!(ii->isBarrier() || ii->isNop() || ii->isReturn() || ii->isBranch() || ii->isALU() || ii->isLoad() || ii->isStore() || ii->isAtomic() || ii->isEndOfKernel() || ii->isMemSync() || ii->isFlat() || - ii->isFlatGlobal() || ii->isSleep() || ii->isLocalMem())) { + ii->isFlatGlobal() || ii->isFlatScratch() || ii->isSleep() || + ii->isLocalMem())) { panic("next instruction: %s is of unknown type\n", ii->disassemble()); }