arch-vega: Architected flat scratch and scratch insts

Architected flat scratch is added in MI300 which store the scratch base address in dedicated registers rather than in SGPRs. These registers are used by scratch_ instructions. These are flat instruction which explicitly target the private memory aperture. These instructions have a different address calculation than global_ instructions. This change implements architected flat scratch support, fixes the address calculation of scratch_ instructions, and implements decodings for some scratch_ instructions. Previous flat_ instructions which happen to access the private memory aperture have no change in address calculation. Since scratch_ instructions are identical to flat_ instruction except for address calculation, the decodings simply reuse existing flat_ instruction definitions. Change-Id: I1e1d15a2fbcc7a4a678157c35608f4f22b359e21
2024-05-15 12:00:47 -07:00
parent 8be5ce6fc9
commit c1803eafac
6 changed files with 160 additions and 47 deletions
--- a/src/gpu-compute/gpu_dyn_inst.cc
+++ b/src/gpu-compute/gpu_dyn_inst.cc
@@ -910,35 +910,63 @@ GPUDynInst::resolveFlatSegment(const VectorMask &mask)
         *     #flat-addressing
         */

-        uint32_t numSgprs = wavefront()->maxSgprs;
-        uint32_t physSgprIdx =
-            wavefront()->computeUnit->registerManager->mapSgpr(wavefront(),
-                                                          numSgprs - 4);
-        uint32_t offset =
-            wavefront()->computeUnit->srf[simdId]->read(physSgprIdx);
-        physSgprIdx =
-            wavefront()->computeUnit->registerManager->mapSgpr(wavefront(),
-                                                          numSgprs - 3);
-        uint32_t size =
-            wavefront()->computeUnit->srf[simdId]->read(physSgprIdx);
-        for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) {
-            if (mask[lane]) {
-                addr[lane] = addr[lane] + lane * size + offset +
-                    wavefront()->computeUnit->shader->getHiddenPrivateBase() -
-                    wavefront()->computeUnit->shader->getScratchBase();
+        ComputeUnit *cu = wavefront()->computeUnit;
+
+        if (wavefront()->gfxVersion == GfxVersion::gfx942) {
+            // Architected flat scratch base address in FLAT_SCRATCH registers
+            uint32_t fs_lo = cu->srf[simdId]->read(
+                VegaISA::REG_FLAT_SCRATCH_LO);
+            uint32_t fs_hi = cu->srf[simdId]->read(
+                VegaISA::REG_FLAT_SCRATCH_HI);
+
+            Addr arch_flat_scratch = ((Addr)(fs_hi) << 32) | fs_lo;
+
+            for (int lane = 0; lane < cu->wfSize(); ++lane) {
+                if (mask[lane]) {
+                    // The scratch base is added for other gfx versions,
+                    // otherwise this would simply add the register base.
+                    addr[lane] = addr[lane] - cu->shader->getScratchBase()
+                        + arch_flat_scratch;
+                }
+            }
+        } else {
+            // In absolute flat scratch the program needs to place scratch
+            // address in SGPRn-3,4.
+            uint32_t numSgprs = wavefront()->maxSgprs;
+            uint32_t physSgprIdx =
+                cu->registerManager->mapSgpr(wavefront(), numSgprs - 4);
+            uint32_t offset = cu->srf[simdId]->read(physSgprIdx);
+            physSgprIdx =
+                cu->registerManager->mapSgpr(wavefront(), numSgprs - 3);
+            uint32_t size = cu->srf[simdId]->read(physSgprIdx);
+
+
+            for (int lane = 0; lane < cu->wfSize(); ++lane) {
+                if (mask[lane]) {
+                    addr[lane] = addr[lane] + lane * size + offset +
+                        cu->shader->getHiddenPrivateBase() -
+                        cu->shader->getScratchBase();
+                }
            }
        }
-        wavefront()->execUnitId =  wavefront()->flatLmUnitId;
-        wavefront()->decLGKMInstsIssued();
-        if (isLoad()) {
-            wavefront()->rdLmReqsInPipe--;
-        } else if (isStore()) {
-            wavefront()->wrLmReqsInPipe--;
-        } else if (isAtomic() || isMemSync()) {
-            wavefront()->wrLmReqsInPipe--;
-            wavefront()->rdLmReqsInPipe--;
-        } else {
-            panic("Invalid memory operation!\n");
+
+        wavefront()->execUnitId = wavefront()->flatLmUnitId;
+
+        // For FLAT the local memory pipe counters are incremented, but they
+        // are not incremented for explicit scratch_* instructions. Only
+        // decrement these counters if we are explicitly a FLAT instruction.
+        if (isFlat()) {
+            wavefront()->decLGKMInstsIssued();
+            if (isLoad()) {
+                wavefront()->rdLmReqsInPipe--;
+            } else if (isStore()) {
+                wavefront()->wrLmReqsInPipe--;
+            } else if (isAtomic() || isMemSync()) {
+                wavefront()->wrLmReqsInPipe--;
+                wavefront()->rdLmReqsInPipe--;
+            } else {
+                panic("Invalid memory operation!\n");
+            }
        }
    } else {
        for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) {
--- a/src/gpu-compute/gpu_static_inst.hh
+++ b/src/gpu-compute/gpu_static_inst.hh
@@ -179,7 +179,8 @@ class GPUStaticInst : public GPUStaticInstFlags
    {
        return _flags[MemoryRef] && (_flags[GlobalSegment] ||
               _flags[PrivateSegment] || _flags[ReadOnlySegment] ||
-               _flags[SpillSegment] || _flags[FlatGlobal]);
+               _flags[SpillSegment] || _flags[FlatGlobal] ||
+               _flags[FlatScratch]);
    }

    bool
--- a/src/gpu-compute/wavefront.cc
+++ b/src/gpu-compute/wavefront.cc
@@ -118,6 +118,7 @@ void
 Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
 {
    int regInitIdx = 0;
+    gfxVersion = task->gfxVersion();

    // Iterate over all the init fields and check which
    // bits are enabled. Useful information can be found here:
@@ -378,8 +379,29 @@ Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
                        wfSlotId, wfDynId, physSgprIdx, workGroupId[2]);
                break;
              case PrivSegWaveByteOffset:
+
+                // For architected flat scratch, this enable is reused to set
+                // the FLAT_SCRATCH register pair to the scratch backing
+                // memory: https://llvm.org/docs/AMDGPUUsage.html#flat-scratch
+                if (task->gfxVersion() == GfxVersion::gfx942) {
+                    Addr arch_flat_scratch =
+                        task->amdQueue.scratch_backing_memory_location;
+                    computeUnit->srf[simdId]->write(
+                        VegaISA::REG_FLAT_SCRATCH_HI,
+                        bits(arch_flat_scratch, 63, 32));
+                    computeUnit->srf[simdId]->write(
+                        VegaISA::REG_FLAT_SCRATCH_LO,
+                        bits(arch_flat_scratch, 31, 0));
+
+                    break;
+                }
+
+                // Not architected flat scratch. Write the scratch wavefront
+                // offset: https://llvm.org/docs/AMDGPUUsage.html
+                //              #amdgpu-amdhsa-initial-kernel-execution-state
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
+
                /**
                  * the compute_tmpring_size_wavesize specifies the number of
                  * kB allocated per wavefront, hence the multiplication by
--- a/src/gpu-compute/wavefront.hh
+++ b/src/gpu-compute/wavefront.hh
@@ -92,6 +92,8 @@ class Wavefront : public SimObject
        S_BARRIER
    };

+    // gfx version wavefront is executing
+    GfxVersion gfxVersion;
    // HW slot id where the WF is mapped to inside a SIMD unit
    const int wfSlotId;
    int kernId;