gpu-compute,arch-vega: Implement flat scratch insts

Flat scratch instructions (aka private) are the 3rd and final segment of
flat instructions in gfx9 (Vega) and beyond. These are used for things
like spills/fills and thread local storage. This commit enables two
forms of flat scratch instructions: (1) flat_load/flat_store
instructions where the memory address resolves to private memory and (2)
the new scratch_load/scratch_store instructions in Vega. The first are
similar to older generation ISAs where the aperture is unknown until
address translation. The second are instructions guaranteed to go to
private memory.

Since these are very similar to flat global instructions there are
minimal changes needed:

- Ensure a flat instruction is exactly one of regular flat, global, or
  scratch (the three segments are mutually exclusive)
- Rename the global op_encoding methods to GlobalScratch to indicate
  they are shared by both the global and scratch variants, and that this
  sharing is intentional.
- Flat instructions in segment 1 output scratch_ in the disassembly
- Flat instructions executed as private use the same memory helpers as
  global
- Flat scratch cannot be an atomic

This was tested using a modified version of the 'square' application:

template <typename T>
__global__ void
scratch_square(T *C_d, T *A_d, size_t N)
{
    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
    size_t stride = blockDim.x * gridDim.x ;

    volatile int foo; // Volatile ensures scratch / unoptimized code

    for (size_t i=offset; i<N; i+=stride) {
        foo = A_d[i];
        C_d[i] = foo * foo;
    }
}

Change-Id: Icc91a7f67836fa3e759fefe7c1c3f6851528ae7d
This commit is contained in:
Matthew Poremba
2023-08-26 13:14:13 -05:00
parent 4506188e00
commit 60f071d09a
8 changed files with 53 additions and 19 deletions

View File

@@ -1546,6 +1546,8 @@ namespace VegaISA
// The SEG field specifies FLAT(0) SCRATCH(1) or GLOBAL(2)
if (iFmt->SEG == 0) {
setFlag(Flat);
} else if (iFmt->SEG == 1) {
setFlag(FlatScratch);
} else if (iFmt->SEG == 2) {
setFlag(FlatGlobal);
} else {
@@ -1573,12 +1575,12 @@ namespace VegaISA
Inst_FLAT::initOperandInfo()
{
// One of the flat subtypes should be specified via flags
assert(isFlat() ^ isFlatGlobal());
assert(isFlat() ^ isFlatGlobal() ^ isFlatScratch());
if (isFlat()) {
initFlatOperandInfo();
} else if (isFlatGlobal()) {
initGlobalOperandInfo();
} else if (isFlatGlobal() || isFlatScratch()) {
initGlobalScratchOperandInfo();
} else {
panic("Unknown flat subtype!\n");
}
@@ -1622,7 +1624,7 @@ namespace VegaISA
}
void
Inst_FLAT::initGlobalOperandInfo()
Inst_FLAT::initGlobalScratchOperandInfo()
{
//3 formats:
// 1 dst + 2 src (load)
@@ -1691,12 +1693,12 @@ namespace VegaISA
Inst_FLAT::generateDisassembly()
{
// One of the flat subtypes should be specified via flags
assert(isFlat() ^ isFlatGlobal());
assert(isFlat() ^ isFlatGlobal() ^ isFlatScratch());
if (isFlat()) {
generateFlatDisassembly();
} else if (isFlatGlobal()) {
generateGlobalDisassembly();
} else if (isFlatGlobal() || isFlatScratch()) {
generateGlobalScratchDisassembly();
} else {
panic("Unknown flat subtype!\n");
}
@@ -1720,11 +1722,16 @@ namespace VegaISA
}
void
Inst_FLAT::generateGlobalDisassembly()
Inst_FLAT::generateGlobalScratchDisassembly()
{
// Replace flat_ with global_ in assembly string
std::string global_opcode = _opcode;
global_opcode.replace(0, 4, "global");
if (isFlatGlobal()) {
global_opcode.replace(0, 4, "global");
} else {
assert(isFlatScratch());
global_opcode.replace(0, 4, "scratch");
}
std::stringstream dis_stream;
dis_stream << global_opcode << " ";

View File

@@ -926,7 +926,8 @@ namespace VegaISA
void
initMemRead(GPUDynInstPtr gpuDynInst)
{
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
if (gpuDynInst->executedAs() == enums::SC_GLOBAL ||
gpuDynInst->executedAs() == enums::SC_PRIVATE) {
initMemReqHelper<T, 1>(gpuDynInst, MemCmd::ReadReq);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
Wavefront *wf = gpuDynInst->wavefront();
@@ -944,7 +945,8 @@ namespace VegaISA
void
initMemRead(GPUDynInstPtr gpuDynInst)
{
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
if (gpuDynInst->executedAs() == enums::SC_GLOBAL ||
gpuDynInst->executedAs() == enums::SC_PRIVATE) {
initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::ReadReq);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
Wavefront *wf = gpuDynInst->wavefront();
@@ -966,7 +968,8 @@ namespace VegaISA
void
initMemWrite(GPUDynInstPtr gpuDynInst)
{
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
if (gpuDynInst->executedAs() == enums::SC_GLOBAL ||
gpuDynInst->executedAs() == enums::SC_PRIVATE) {
initMemReqHelper<T, 1>(gpuDynInst, MemCmd::WriteReq);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
Wavefront *wf = gpuDynInst->wavefront();
@@ -984,7 +987,8 @@ namespace VegaISA
void
initMemWrite(GPUDynInstPtr gpuDynInst)
{
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
if (gpuDynInst->executedAs() == enums::SC_GLOBAL ||
gpuDynInst->executedAs() == enums::SC_PRIVATE) {
initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::WriteReq);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
Wavefront *wf = gpuDynInst->wavefront();
@@ -1006,6 +1010,10 @@ namespace VegaISA
void
initAtomicAccess(GPUDynInstPtr gpuDynInst)
{
// Flat scratch requests may not be atomic according to ISA manual
// up to MI200. See MI200 manual Table 45.
assert(gpuDynInst->executedAs() != enums::SC_PRIVATE);
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
initMemReqHelper<T, 1>(gpuDynInst, MemCmd::SwapReq, true);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
@@ -1044,7 +1052,8 @@ namespace VegaISA
// If saddr = 0x7f there is no scalar reg to read and address will
// be a 64-bit address. Otherwise, saddr is the reg index for a
// scalar reg used as the base address for a 32-bit address.
if ((saddr == 0x7f && isFlatGlobal()) || isFlat()) {
if ((saddr == 0x7f && (isFlatGlobal() || isFlatScratch()))
|| isFlat()) {
ConstVecOperandU64 vbase(gpuDynInst, vaddr);
vbase.read();
@@ -1063,9 +1072,13 @@ namespace VegaISA
if (isFlat()) {
gpuDynInst->resolveFlatSegment(gpuDynInst->exec_mask);
} else {
} else if (isFlatGlobal()) {
gpuDynInst->staticInstruction()->executed_as =
enums::SC_GLOBAL;
} else {
assert(isFlatScratch());
gpuDynInst->staticInstruction()->executed_as =
enums::SC_PRIVATE;
}
}
@@ -1081,7 +1094,9 @@ namespace VegaISA
gpuDynInst->computeUnit()->localMemoryPipe
.issueRequest(gpuDynInst);
} else {
fatal("Unsupported scope for flat instruction.\n");
assert(gpuDynInst->executedAs() == enums::SC_PRIVATE);
gpuDynInst->computeUnit()->globalMemoryPipe
.issueRequest(gpuDynInst);
}
}
@@ -1098,10 +1113,10 @@ namespace VegaISA
private:
void initFlatOperandInfo();
void initGlobalOperandInfo();
void initGlobalScratchOperandInfo();
void generateFlatDisassembly();
void generateGlobalDisassembly();
void generateGlobalScratchDisassembly();
void
calcAddrSgpr(GPUDynInstPtr gpuDynInst, ConstVecOperandU32 &vaddr,

View File

@@ -54,6 +54,7 @@ class GPUStaticInstFlags(Enum):
"MemoryRef", # References memory (load, store, or atomic)
"Flat", # Flat memory op
"FlatGlobal", # Global memory op
"FlatScratch", # Scratch memory op
"Load", # Reads from memory
"Store", # Writes to memory
# Atomic ops

View File

@@ -1917,6 +1917,8 @@ ComputeUnit::updateInstStats(GPUDynInstPtr gpuDynInst)
}
} else if (gpuDynInst->isFlatGlobal()) {
stats.flatVMemInsts++;
} else if (gpuDynInst->isFlatScratch()) {
stats.flatVMemInsts++;
} else if (gpuDynInst->isLocalMem()) {
stats.ldsNoFlatInsts++;
} else if (gpuDynInst->isLoad()) {

View File

@@ -432,6 +432,12 @@ GPUDynInst::isFlatGlobal() const
return _staticInst->isFlatGlobal();
}
bool
GPUDynInst::isFlatScratch() const
{
return _staticInst->isFlatScratch();
}
bool
GPUDynInst::isLoad() const
{

View File

@@ -234,6 +234,7 @@ class GPUDynInst : public GPUExecContext
bool isMemRef() const;
bool isFlat() const;
bool isFlatGlobal() const;
bool isFlatScratch() const;
bool isLoad() const;
bool isStore() const;

View File

@@ -130,6 +130,7 @@ class GPUStaticInst : public GPUStaticInstFlags
bool isMemRef() const { return _flags[MemoryRef]; }
bool isFlat() const { return _flags[Flat]; }
bool isFlatGlobal() const { return _flags[FlatGlobal]; }
bool isFlatScratch() const { return _flags[FlatScratch]; }
bool isLoad() const { return _flags[Load]; }
bool isStore() const { return _flags[Store]; }

View File

@@ -154,7 +154,8 @@ ScoreboardCheckStage::ready(Wavefront *w, nonrdytype_e *rdyStatus,
if (!(ii->isBarrier() || ii->isNop() || ii->isReturn() || ii->isBranch() ||
ii->isALU() || ii->isLoad() || ii->isStore() || ii->isAtomic() ||
ii->isEndOfKernel() || ii->isMemSync() || ii->isFlat() ||
ii->isFlatGlobal() || ii->isSleep() || ii->isLocalMem())) {
ii->isFlatGlobal() || ii->isFlatScratch() || ii->isSleep() ||
ii->isLocalMem())) {
panic("next instruction: %s is of unknown type\n", ii->disassemble());
}