From e0379f45262fc582793ed29e88bd6d69a8241196 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Sat, 26 Aug 2023 13:06:54 -0500 Subject: [PATCH 1/3] gpu-compute: Fix flat scratch resource counters Flat instructions may access memory locations in LDS (scratchpad) and global (VRAM/framebuffer) and therefore increment both counters when dispatched. Once the aperture is known, we decrement the counters of the aperture that was *not* used. This is done incorrectly for scratch / private flat instructions. Private memory is backed by global memory, and therefore the local memory counters are the ones that should be decremented. This commit fixes the counters by changing the global decrements to local decrements. Change-Id: I25890446908df72e5469e9dbaba6c984955196cf --- src/gpu-compute/gpu_dyn_inst.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc index 3cbb6f1ff8..991ba2c07a 100644 --- a/src/gpu-compute/gpu_dyn_inst.cc +++ b/src/gpu-compute/gpu_dyn_inst.cc @@ -919,12 +919,12 @@ GPUDynInst::resolveFlatSegment(const VectorMask &mask) wavefront()->execUnitId = wavefront()->flatLmUnitId; wavefront()->decLGKMInstsIssued(); if (isLoad()) { - wavefront()->rdGmReqsInPipe--; + wavefront()->rdLmReqsInPipe--; } else if (isStore()) { - wavefront()->wrGmReqsInPipe--; + wavefront()->wrLmReqsInPipe--; } else if (isAtomic() || isMemSync()) { - wavefront()->rdGmReqsInPipe--; - wavefront()->wrGmReqsInPipe--; + wavefront()->wrLmReqsInPipe--; + wavefront()->rdLmReqsInPipe--; } else { panic("Invalid memory operation!\n"); } From 4506188e003c7f67e1323f1330da239f22441da7 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Sat, 26 Aug 2023 13:09:28 -0500 Subject: [PATCH 2/3] gpu-compute: Fix private offset/size register indexes According to the ABI documentation from LLVM, the *low* register of flat scratch (maxSGPR - 4) is the offset and the high register (maxSGPR - 3) is the size. 
These are currently backwards, resulting in some gnarly addresses being generated, leading to page faults and/or incorrect data. This commit fixes the issue by setting the order correctly. Change-Id: I0b1d077c49c0ee2a4e59b0f6d85cdb8f17f9be61 --- src/gpu-compute/gpu_dyn_inst.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc index 991ba2c07a..8dfae1dc90 100644 --- a/src/gpu-compute/gpu_dyn_inst.cc +++ b/src/gpu-compute/gpu_dyn_inst.cc @@ -901,12 +901,12 @@ GPUDynInst::resolveFlatSegment(const VectorMask &mask) uint32_t numSgprs = wavefront()->maxSgprs; uint32_t physSgprIdx = wavefront()->computeUnit->registerManager->mapSgpr(wavefront(), - numSgprs - 3); + numSgprs - 4); uint32_t offset = wavefront()->computeUnit->srf[simdId]->read(physSgprIdx); physSgprIdx = wavefront()->computeUnit->registerManager->mapSgpr(wavefront(), - numSgprs - 4); + numSgprs - 3); uint32_t size = wavefront()->computeUnit->srf[simdId]->read(physSgprIdx); for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) { From 60f071d09a03ced31c2f1c7496222b3007a33bae Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Sat, 26 Aug 2023 13:14:13 -0500 Subject: [PATCH 3/3] gpu-compute,arch-vega: Implement flat scratch insts Flat scratch instructions (aka private) are the 3rd and final segment of flat instructions in gfx9 (Vega) and beyond. These are used for things like spills/fills and thread local storage. This commit enables two forms of flat scratch instructions: (1) flat_load/flat_store instructions where the memory address resolves to private memory and (2) the new scratch_load/scratch_store instructions in Vega. The first are similar to older generation ISAs where the aperture is unknown until address translation. The second are instructions guaranteed to go to private memory. 
Since these are very similar to flat global instructions there are minimal changes needed: - Ensure a flat instruction is either regular flat, global, XOR scratch - Rename the global op_encoding methods to GlobalScratch to indicate they are for both and are intentionally used. - Flat instructions in segment 1 output scratch_ in the disassembly - Flat instruction executed as private use similar mem helpers as global - Flat scratch cannot be an atomic This was tested using a modified version of the 'square' application: template __global__ void scratch_square(T *C_d, T *A_d, size_t N) { size_t offset = (blockIdx.x * blockDim.x + threadIdx.x); size_t stride = blockDim.x * gridDim.x ; volatile int foo; // Volatile ensures scratch / unoptimized code for (size_t i=offset; iSEG == 0) { setFlag(Flat); + } else if (iFmt->SEG == 1) { + setFlag(FlatScratch); } else if (iFmt->SEG == 2) { setFlag(FlatGlobal); } else { @@ -1573,12 +1575,12 @@ namespace VegaISA Inst_FLAT::initOperandInfo() { // One of the flat subtypes should be specified via flags - assert(isFlat() ^ isFlatGlobal()); + assert(isFlat() ^ isFlatGlobal() ^ isFlatScratch()); if (isFlat()) { initFlatOperandInfo(); - } else if (isFlatGlobal()) { - initGlobalOperandInfo(); + } else if (isFlatGlobal() || isFlatScratch()) { + initGlobalScratchOperandInfo(); } else { panic("Unknown flat subtype!\n"); } @@ -1622,7 +1624,7 @@ namespace VegaISA } void - Inst_FLAT::initGlobalOperandInfo() + Inst_FLAT::initGlobalScratchOperandInfo() { //3 formats: // 1 dst + 2 src (load) @@ -1691,12 +1693,12 @@ namespace VegaISA Inst_FLAT::generateDisassembly() { // One of the flat subtypes should be specified via flags - assert(isFlat() ^ isFlatGlobal()); + assert(isFlat() ^ isFlatGlobal() ^ isFlatScratch()); if (isFlat()) { generateFlatDisassembly(); - } else if (isFlatGlobal()) { - generateGlobalDisassembly(); + } else if (isFlatGlobal() || isFlatScratch()) { + generateGlobalScratchDisassembly(); } else { panic("Unknown flat subtype!\n"); } 
@@ -1720,11 +1722,16 @@ namespace VegaISA } void - Inst_FLAT::generateGlobalDisassembly() + Inst_FLAT::generateGlobalScratchDisassembly() { // Replace flat_ with global_ in assembly string std::string global_opcode = _opcode; - global_opcode.replace(0, 4, "global"); + if (isFlatGlobal()) { + global_opcode.replace(0, 4, "global"); + } else { + assert(isFlatScratch()); + global_opcode.replace(0, 4, "scratch"); + } std::stringstream dis_stream; dis_stream << global_opcode << " "; diff --git a/src/arch/amdgpu/vega/insts/op_encodings.hh b/src/arch/amdgpu/vega/insts/op_encodings.hh index 0f5f502add..613d78b25e 100644 --- a/src/arch/amdgpu/vega/insts/op_encodings.hh +++ b/src/arch/amdgpu/vega/insts/op_encodings.hh @@ -926,7 +926,8 @@ namespace VegaISA void initMemRead(GPUDynInstPtr gpuDynInst) { - if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { + if (gpuDynInst->executedAs() == enums::SC_GLOBAL || + gpuDynInst->executedAs() == enums::SC_PRIVATE) { initMemReqHelper(gpuDynInst, MemCmd::ReadReq); } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { Wavefront *wf = gpuDynInst->wavefront(); @@ -944,7 +945,8 @@ namespace VegaISA void initMemRead(GPUDynInstPtr gpuDynInst) { - if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { + if (gpuDynInst->executedAs() == enums::SC_GLOBAL || + gpuDynInst->executedAs() == enums::SC_PRIVATE) { initMemReqHelper(gpuDynInst, MemCmd::ReadReq); } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { Wavefront *wf = gpuDynInst->wavefront(); @@ -966,7 +968,8 @@ namespace VegaISA void initMemWrite(GPUDynInstPtr gpuDynInst) { - if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { + if (gpuDynInst->executedAs() == enums::SC_GLOBAL || + gpuDynInst->executedAs() == enums::SC_PRIVATE) { initMemReqHelper(gpuDynInst, MemCmd::WriteReq); } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { Wavefront *wf = gpuDynInst->wavefront(); @@ -984,7 +987,8 @@ namespace VegaISA void initMemWrite(GPUDynInstPtr gpuDynInst) { - if (gpuDynInst->executedAs() 
== enums::SC_GLOBAL) { + if (gpuDynInst->executedAs() == enums::SC_GLOBAL || + gpuDynInst->executedAs() == enums::SC_PRIVATE) { initMemReqHelper(gpuDynInst, MemCmd::WriteReq); } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { Wavefront *wf = gpuDynInst->wavefront(); @@ -1006,6 +1010,10 @@ namespace VegaISA void initAtomicAccess(GPUDynInstPtr gpuDynInst) { + // Flat scratch requests may not be atomic according to ISA manual + // up to MI200. See MI200 manual Table 45. + assert(gpuDynInst->executedAs() != enums::SC_PRIVATE); + if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { initMemReqHelper(gpuDynInst, MemCmd::SwapReq, true); } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { @@ -1044,7 +1052,8 @@ namespace VegaISA // If saddr = 0x7f there is no scalar reg to read and address will // be a 64-bit address. Otherwise, saddr is the reg index for a // scalar reg used as the base address for a 32-bit address. - if ((saddr == 0x7f && isFlatGlobal()) || isFlat()) { + if ((saddr == 0x7f && (isFlatGlobal() || isFlatScratch())) + || isFlat()) { ConstVecOperandU64 vbase(gpuDynInst, vaddr); vbase.read(); @@ -1063,9 +1072,13 @@ namespace VegaISA if (isFlat()) { gpuDynInst->resolveFlatSegment(gpuDynInst->exec_mask); - } else { + } else if (isFlatGlobal()) { gpuDynInst->staticInstruction()->executed_as = enums::SC_GLOBAL; + } else { + assert(isFlatScratch()); + gpuDynInst->staticInstruction()->executed_as = + enums::SC_PRIVATE; } } @@ -1081,7 +1094,9 @@ namespace VegaISA gpuDynInst->computeUnit()->localMemoryPipe .issueRequest(gpuDynInst); } else { - fatal("Unsupported scope for flat instruction.\n"); + assert(gpuDynInst->executedAs() == enums::SC_PRIVATE); + gpuDynInst->computeUnit()->globalMemoryPipe + .issueRequest(gpuDynInst); } } @@ -1098,10 +1113,10 @@ namespace VegaISA private: void initFlatOperandInfo(); - void initGlobalOperandInfo(); + void initGlobalScratchOperandInfo(); void generateFlatDisassembly(); - void generateGlobalDisassembly(); + void 
generateGlobalScratchDisassembly(); void calcAddrSgpr(GPUDynInstPtr gpuDynInst, ConstVecOperandU32 &vaddr, diff --git a/src/gpu-compute/GPUStaticInstFlags.py b/src/gpu-compute/GPUStaticInstFlags.py index b75e2c6c92..3a44d402be 100644 --- a/src/gpu-compute/GPUStaticInstFlags.py +++ b/src/gpu-compute/GPUStaticInstFlags.py @@ -54,6 +54,7 @@ class GPUStaticInstFlags(Enum): "MemoryRef", # References memory (load, store, or atomic) "Flat", # Flat memory op "FlatGlobal", # Global memory op + "FlatScratch", # Scratch memory op "Load", # Reads from memory "Store", # Writes to memory # Atomic ops diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc index ea903455d5..8d6deeb85a 100644 --- a/src/gpu-compute/compute_unit.cc +++ b/src/gpu-compute/compute_unit.cc @@ -1917,6 +1917,8 @@ ComputeUnit::updateInstStats(GPUDynInstPtr gpuDynInst) } } else if (gpuDynInst->isFlatGlobal()) { stats.flatVMemInsts++; + } else if (gpuDynInst->isFlatScratch()) { + stats.flatVMemInsts++; } else if (gpuDynInst->isLocalMem()) { stats.ldsNoFlatInsts++; } else if (gpuDynInst->isLoad()) { diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc index 8dfae1dc90..0b394e7e36 100644 --- a/src/gpu-compute/gpu_dyn_inst.cc +++ b/src/gpu-compute/gpu_dyn_inst.cc @@ -432,6 +432,12 @@ GPUDynInst::isFlatGlobal() const return _staticInst->isFlatGlobal(); } +bool +GPUDynInst::isFlatScratch() const +{ + return _staticInst->isFlatScratch(); +} + bool GPUDynInst::isLoad() const { diff --git a/src/gpu-compute/gpu_dyn_inst.hh b/src/gpu-compute/gpu_dyn_inst.hh index e2884a012a..558cce8431 100644 --- a/src/gpu-compute/gpu_dyn_inst.hh +++ b/src/gpu-compute/gpu_dyn_inst.hh @@ -234,6 +234,7 @@ class GPUDynInst : public GPUExecContext bool isMemRef() const; bool isFlat() const; bool isFlatGlobal() const; + bool isFlatScratch() const; bool isLoad() const; bool isStore() const; diff --git a/src/gpu-compute/gpu_static_inst.hh b/src/gpu-compute/gpu_static_inst.hh index 
b86a507dce..156f0e529d 100644 --- a/src/gpu-compute/gpu_static_inst.hh +++ b/src/gpu-compute/gpu_static_inst.hh @@ -130,6 +130,7 @@ class GPUStaticInst : public GPUStaticInstFlags bool isMemRef() const { return _flags[MemoryRef]; } bool isFlat() const { return _flags[Flat]; } bool isFlatGlobal() const { return _flags[FlatGlobal]; } + bool isFlatScratch() const { return _flags[FlatScratch]; } bool isLoad() const { return _flags[Load]; } bool isStore() const { return _flags[Store]; } diff --git a/src/gpu-compute/scoreboard_check_stage.cc b/src/gpu-compute/scoreboard_check_stage.cc index 3d18260822..b618cab278 100644 --- a/src/gpu-compute/scoreboard_check_stage.cc +++ b/src/gpu-compute/scoreboard_check_stage.cc @@ -154,7 +154,8 @@ ScoreboardCheckStage::ready(Wavefront *w, nonrdytype_e *rdyStatus, if (!(ii->isBarrier() || ii->isNop() || ii->isReturn() || ii->isBranch() || ii->isALU() || ii->isLoad() || ii->isStore() || ii->isAtomic() || ii->isEndOfKernel() || ii->isMemSync() || ii->isFlat() || - ii->isFlatGlobal() || ii->isSleep() || ii->isLocalMem())) { + ii->isFlatGlobal() || ii->isFlatScratch() || ii->isSleep() || + ii->isLocalMem())) { panic("next instruction: %s is of unknown type\n", ii->disassemble()); }