gpu-compute: Flat scratch implementation and bug fixes (#231)

Add commits that fix the private segment counters and the flat scratch
address calculation, and that implement the flat scratch instructions.

These commits were tested using a modified version of 'square':

// Test kernel: squares each element of A_d into C_d via a volatile
// local, so every element round-trips through a scratch (private
// memory) access — this exercises the flat-scratch path under test.
template <typename T>
__global__ void
scratch_square(T *C_d, T *A_d, size_t N)
{
    // Grid-stride loop: each thread starts at its global thread index
    // and advances by the total number of threads in the grid.
    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
    size_t stride = blockDim.x * gridDim.x ;

    // Volatile forces the compiler to keep foo in memory (scratch)
    // instead of a register, preventing the load/square/store from
    // being optimized into pure register arithmetic.
    // NOTE: foo is int, so non-integer T values are converted to int
    // before squaring.
    volatile int foo; // Volatile ensures scratch / unoptimized code

    for (size_t i=offset; i<N; i+=stride) {
        foo = A_d[i];
        C_d[i] = foo * foo;
    }
}
This commit is contained in:
Matthew Poremba
2023-08-27 07:40:24 -07:00
committed by GitHub
8 changed files with 59 additions and 25 deletions

View File

@@ -1546,6 +1546,8 @@ namespace VegaISA
// The SEG field specifies FLAT(0) SCRATCH(1) or GLOBAL(2)
if (iFmt->SEG == 0) {
setFlag(Flat);
} else if (iFmt->SEG == 1) {
setFlag(FlatScratch);
} else if (iFmt->SEG == 2) {
setFlag(FlatGlobal);
} else {
@@ -1573,12 +1575,12 @@ namespace VegaISA
Inst_FLAT::initOperandInfo()
{
// One of the flat subtypes should be specified via flags
assert(isFlat() ^ isFlatGlobal());
assert(isFlat() ^ isFlatGlobal() ^ isFlatScratch());
if (isFlat()) {
initFlatOperandInfo();
} else if (isFlatGlobal()) {
initGlobalOperandInfo();
} else if (isFlatGlobal() || isFlatScratch()) {
initGlobalScratchOperandInfo();
} else {
panic("Unknown flat subtype!\n");
}
@@ -1622,7 +1624,7 @@ namespace VegaISA
}
void
Inst_FLAT::initGlobalOperandInfo()
Inst_FLAT::initGlobalScratchOperandInfo()
{
//3 formats:
// 1 dst + 2 src (load)
@@ -1691,12 +1693,12 @@ namespace VegaISA
Inst_FLAT::generateDisassembly()
{
// One of the flat subtypes should be specified via flags
assert(isFlat() ^ isFlatGlobal());
assert(isFlat() ^ isFlatGlobal() ^ isFlatScratch());
if (isFlat()) {
generateFlatDisassembly();
} else if (isFlatGlobal()) {
generateGlobalDisassembly();
} else if (isFlatGlobal() || isFlatScratch()) {
generateGlobalScratchDisassembly();
} else {
panic("Unknown flat subtype!\n");
}
@@ -1720,11 +1722,16 @@ namespace VegaISA
}
void
Inst_FLAT::generateGlobalDisassembly()
Inst_FLAT::generateGlobalScratchDisassembly()
{
// Replace flat_ with global_ in assembly string
std::string global_opcode = _opcode;
global_opcode.replace(0, 4, "global");
if (isFlatGlobal()) {
global_opcode.replace(0, 4, "global");
} else {
assert(isFlatScratch());
global_opcode.replace(0, 4, "scratch");
}
std::stringstream dis_stream;
dis_stream << global_opcode << " ";

View File

@@ -926,7 +926,8 @@ namespace VegaISA
void
initMemRead(GPUDynInstPtr gpuDynInst)
{
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
if (gpuDynInst->executedAs() == enums::SC_GLOBAL ||
gpuDynInst->executedAs() == enums::SC_PRIVATE) {
initMemReqHelper<T, 1>(gpuDynInst, MemCmd::ReadReq);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
Wavefront *wf = gpuDynInst->wavefront();
@@ -944,7 +945,8 @@ namespace VegaISA
void
initMemRead(GPUDynInstPtr gpuDynInst)
{
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
if (gpuDynInst->executedAs() == enums::SC_GLOBAL ||
gpuDynInst->executedAs() == enums::SC_PRIVATE) {
initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::ReadReq);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
Wavefront *wf = gpuDynInst->wavefront();
@@ -966,7 +968,8 @@ namespace VegaISA
void
initMemWrite(GPUDynInstPtr gpuDynInst)
{
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
if (gpuDynInst->executedAs() == enums::SC_GLOBAL ||
gpuDynInst->executedAs() == enums::SC_PRIVATE) {
initMemReqHelper<T, 1>(gpuDynInst, MemCmd::WriteReq);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
Wavefront *wf = gpuDynInst->wavefront();
@@ -984,7 +987,8 @@ namespace VegaISA
void
initMemWrite(GPUDynInstPtr gpuDynInst)
{
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
if (gpuDynInst->executedAs() == enums::SC_GLOBAL ||
gpuDynInst->executedAs() == enums::SC_PRIVATE) {
initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::WriteReq);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
Wavefront *wf = gpuDynInst->wavefront();
@@ -1006,6 +1010,10 @@ namespace VegaISA
void
initAtomicAccess(GPUDynInstPtr gpuDynInst)
{
// Flat scratch requests may not be atomic according to ISA manual
// up to MI200. See MI200 manual Table 45.
assert(gpuDynInst->executedAs() != enums::SC_PRIVATE);
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
initMemReqHelper<T, 1>(gpuDynInst, MemCmd::SwapReq, true);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
@@ -1044,7 +1052,8 @@ namespace VegaISA
// If saddr = 0x7f there is no scalar reg to read and address will
// be a 64-bit address. Otherwise, saddr is the reg index for a
// scalar reg used as the base address for a 32-bit address.
if ((saddr == 0x7f && isFlatGlobal()) || isFlat()) {
if ((saddr == 0x7f && (isFlatGlobal() || isFlatScratch()))
|| isFlat()) {
ConstVecOperandU64 vbase(gpuDynInst, vaddr);
vbase.read();
@@ -1063,9 +1072,13 @@ namespace VegaISA
if (isFlat()) {
gpuDynInst->resolveFlatSegment(gpuDynInst->exec_mask);
} else {
} else if (isFlatGlobal()) {
gpuDynInst->staticInstruction()->executed_as =
enums::SC_GLOBAL;
} else {
assert(isFlatScratch());
gpuDynInst->staticInstruction()->executed_as =
enums::SC_PRIVATE;
}
}
@@ -1081,7 +1094,9 @@ namespace VegaISA
gpuDynInst->computeUnit()->localMemoryPipe
.issueRequest(gpuDynInst);
} else {
fatal("Unsupported scope for flat instruction.\n");
assert(gpuDynInst->executedAs() == enums::SC_PRIVATE);
gpuDynInst->computeUnit()->globalMemoryPipe
.issueRequest(gpuDynInst);
}
}
@@ -1098,10 +1113,10 @@ namespace VegaISA
private:
void initFlatOperandInfo();
void initGlobalOperandInfo();
void initGlobalScratchOperandInfo();
void generateFlatDisassembly();
void generateGlobalDisassembly();
void generateGlobalScratchDisassembly();
void
calcAddrSgpr(GPUDynInstPtr gpuDynInst, ConstVecOperandU32 &vaddr,

View File

@@ -54,6 +54,7 @@ class GPUStaticInstFlags(Enum):
"MemoryRef", # References memory (load, store, or atomic)
"Flat", # Flat memory op
"FlatGlobal", # Global memory op
"FlatScratch", # Scratch memory op
"Load", # Reads from memory
"Store", # Writes to memory
# Atomic ops

View File

@@ -1917,6 +1917,8 @@ ComputeUnit::updateInstStats(GPUDynInstPtr gpuDynInst)
}
} else if (gpuDynInst->isFlatGlobal()) {
stats.flatVMemInsts++;
} else if (gpuDynInst->isFlatScratch()) {
stats.flatVMemInsts++;
} else if (gpuDynInst->isLocalMem()) {
stats.ldsNoFlatInsts++;
} else if (gpuDynInst->isLoad()) {

View File

@@ -432,6 +432,12 @@ GPUDynInst::isFlatGlobal() const
return _staticInst->isFlatGlobal();
}
bool
GPUDynInst::isFlatScratch() const
{
return _staticInst->isFlatScratch();
}
bool
GPUDynInst::isLoad() const
{
@@ -901,12 +907,12 @@ GPUDynInst::resolveFlatSegment(const VectorMask &mask)
uint32_t numSgprs = wavefront()->maxSgprs;
uint32_t physSgprIdx =
wavefront()->computeUnit->registerManager->mapSgpr(wavefront(),
numSgprs - 3);
numSgprs - 4);
uint32_t offset =
wavefront()->computeUnit->srf[simdId]->read(physSgprIdx);
physSgprIdx =
wavefront()->computeUnit->registerManager->mapSgpr(wavefront(),
numSgprs - 4);
numSgprs - 3);
uint32_t size =
wavefront()->computeUnit->srf[simdId]->read(physSgprIdx);
for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) {
@@ -919,12 +925,12 @@ GPUDynInst::resolveFlatSegment(const VectorMask &mask)
wavefront()->execUnitId = wavefront()->flatLmUnitId;
wavefront()->decLGKMInstsIssued();
if (isLoad()) {
wavefront()->rdGmReqsInPipe--;
wavefront()->rdLmReqsInPipe--;
} else if (isStore()) {
wavefront()->wrGmReqsInPipe--;
wavefront()->wrLmReqsInPipe--;
} else if (isAtomic() || isMemSync()) {
wavefront()->rdGmReqsInPipe--;
wavefront()->wrGmReqsInPipe--;
wavefront()->wrLmReqsInPipe--;
wavefront()->rdLmReqsInPipe--;
} else {
panic("Invalid memory operation!\n");
}

View File

@@ -234,6 +234,7 @@ class GPUDynInst : public GPUExecContext
bool isMemRef() const;
bool isFlat() const;
bool isFlatGlobal() const;
bool isFlatScratch() const;
bool isLoad() const;
bool isStore() const;

View File

@@ -130,6 +130,7 @@ class GPUStaticInst : public GPUStaticInstFlags
bool isMemRef() const { return _flags[MemoryRef]; }
bool isFlat() const { return _flags[Flat]; }
bool isFlatGlobal() const { return _flags[FlatGlobal]; }
bool isFlatScratch() const { return _flags[FlatScratch]; }
bool isLoad() const { return _flags[Load]; }
bool isStore() const { return _flags[Store]; }

View File

@@ -154,7 +154,8 @@ ScoreboardCheckStage::ready(Wavefront *w, nonrdytype_e *rdyStatus,
if (!(ii->isBarrier() || ii->isNop() || ii->isReturn() || ii->isBranch() ||
ii->isALU() || ii->isLoad() || ii->isStore() || ii->isAtomic() ||
ii->isEndOfKernel() || ii->isMemSync() || ii->isFlat() ||
ii->isFlatGlobal() || ii->isSleep() || ii->isLocalMem())) {
ii->isFlatGlobal() || ii->isFlatScratch() || ii->isSleep() ||
ii->isLocalMem())) {
panic("next instruction: %s is of unknown type\n", ii->disassemble());
}