diff --git a/src/arch/amdgpu/vega/insts/op_encodings.cc b/src/arch/amdgpu/vega/insts/op_encodings.cc index cc650fbbd0..c934094d9b 100644 --- a/src/arch/amdgpu/vega/insts/op_encodings.cc +++ b/src/arch/amdgpu/vega/insts/op_encodings.cc @@ -1546,6 +1546,8 @@ namespace VegaISA // The SEG field specifies FLAT(0) SCRATCH(1) or GLOBAL(2) if (iFmt->SEG == 0) { setFlag(Flat); + } else if (iFmt->SEG == 1) { + setFlag(FlatScratch); } else if (iFmt->SEG == 2) { setFlag(FlatGlobal); } else { @@ -1573,12 +1575,12 @@ namespace VegaISA Inst_FLAT::initOperandInfo() { // One of the flat subtypes should be specified via flags - assert(isFlat() ^ isFlatGlobal()); + assert(isFlat() ^ isFlatGlobal() ^ isFlatScratch()); if (isFlat()) { initFlatOperandInfo(); - } else if (isFlatGlobal()) { - initGlobalOperandInfo(); + } else if (isFlatGlobal() || isFlatScratch()) { + initGlobalScratchOperandInfo(); } else { panic("Unknown flat subtype!\n"); } @@ -1622,7 +1624,7 @@ namespace VegaISA } void - Inst_FLAT::initGlobalOperandInfo() + Inst_FLAT::initGlobalScratchOperandInfo() { //3 formats: // 1 dst + 2 src (load) @@ -1691,12 +1693,12 @@ namespace VegaISA Inst_FLAT::generateDisassembly() { // One of the flat subtypes should be specified via flags - assert(isFlat() ^ isFlatGlobal()); + assert(isFlat() ^ isFlatGlobal() ^ isFlatScratch()); if (isFlat()) { generateFlatDisassembly(); - } else if (isFlatGlobal()) { - generateGlobalDisassembly(); + } else if (isFlatGlobal() || isFlatScratch()) { + generateGlobalScratchDisassembly(); } else { panic("Unknown flat subtype!\n"); } @@ -1720,11 +1722,16 @@ namespace VegaISA } void - Inst_FLAT::generateGlobalDisassembly() + Inst_FLAT::generateGlobalScratchDisassembly() { // Replace flat_ with global_ in assembly string std::string global_opcode = _opcode; - global_opcode.replace(0, 4, "global"); + if (isFlatGlobal()) { + global_opcode.replace(0, 4, "global"); + } else { + assert(isFlatScratch()); + global_opcode.replace(0, 4, "scratch"); + } std::stringstream dis_stream; dis_stream << global_opcode << " "; diff --git a/src/arch/amdgpu/vega/insts/op_encodings.hh b/src/arch/amdgpu/vega/insts/op_encodings.hh index 0f5f502add..613d78b25e 100644 --- a/src/arch/amdgpu/vega/insts/op_encodings.hh +++ b/src/arch/amdgpu/vega/insts/op_encodings.hh @@ -926,7 +926,8 @@ namespace VegaISA void initMemRead(GPUDynInstPtr gpuDynInst) { - if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { + if (gpuDynInst->executedAs() == enums::SC_GLOBAL || + gpuDynInst->executedAs() == enums::SC_PRIVATE) { initMemReqHelper(gpuDynInst, MemCmd::ReadReq); } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { Wavefront *wf = gpuDynInst->wavefront(); @@ -944,7 +945,8 @@ namespace VegaISA void initMemRead(GPUDynInstPtr gpuDynInst) { - if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { + if (gpuDynInst->executedAs() == enums::SC_GLOBAL || + gpuDynInst->executedAs() == enums::SC_PRIVATE) { initMemReqHelper(gpuDynInst, MemCmd::ReadReq); } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { Wavefront *wf = gpuDynInst->wavefront(); @@ -966,7 +968,8 @@ namespace VegaISA void initMemWrite(GPUDynInstPtr gpuDynInst) { - if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { + if (gpuDynInst->executedAs() == enums::SC_GLOBAL || + gpuDynInst->executedAs() == enums::SC_PRIVATE) { initMemReqHelper(gpuDynInst, MemCmd::WriteReq); } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { Wavefront *wf = gpuDynInst->wavefront(); @@ -984,7 +987,8 @@ namespace VegaISA void initMemWrite(GPUDynInstPtr gpuDynInst) { - if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { + if (gpuDynInst->executedAs() == enums::SC_GLOBAL || + gpuDynInst->executedAs() == enums::SC_PRIVATE) { initMemReqHelper(gpuDynInst, MemCmd::WriteReq); } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { Wavefront *wf = gpuDynInst->wavefront(); @@ -1006,6 +1010,10 @@ namespace VegaISA void initAtomicAccess(GPUDynInstPtr gpuDynInst) { + // Flat scratch requests may not be atomic according to ISA manual + // up to MI200. See MI200 manual Table 45. + assert(gpuDynInst->executedAs() != enums::SC_PRIVATE); + if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { initMemReqHelper(gpuDynInst, MemCmd::SwapReq, true); } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { @@ -1044,7 +1052,8 @@ namespace VegaISA // If saddr = 0x7f there is no scalar reg to read and address will // be a 64-bit address. Otherwise, saddr is the reg index for a // scalar reg used as the base address for a 32-bit address. - if ((saddr == 0x7f && isFlatGlobal()) || isFlat()) { + if ((saddr == 0x7f && (isFlatGlobal() || isFlatScratch())) + || isFlat()) { ConstVecOperandU64 vbase(gpuDynInst, vaddr); vbase.read(); @@ -1063,9 +1072,13 @@ namespace VegaISA if (isFlat()) { gpuDynInst->resolveFlatSegment(gpuDynInst->exec_mask); - } else { + } else if (isFlatGlobal()) { gpuDynInst->staticInstruction()->executed_as = enums::SC_GLOBAL; + } else { + assert(isFlatScratch()); + gpuDynInst->staticInstruction()->executed_as = + enums::SC_PRIVATE; } } @@ -1081,7 +1094,9 @@ namespace VegaISA gpuDynInst->computeUnit()->localMemoryPipe .issueRequest(gpuDynInst); } else { - fatal("Unsupported scope for flat instruction.\n"); + assert(gpuDynInst->executedAs() == enums::SC_PRIVATE); + gpuDynInst->computeUnit()->globalMemoryPipe + .issueRequest(gpuDynInst); } } @@ -1098,10 +1113,10 @@ namespace VegaISA private: void initFlatOperandInfo(); - void initGlobalOperandInfo(); + void initGlobalScratchOperandInfo(); void generateFlatDisassembly(); - void generateGlobalDisassembly(); + void generateGlobalScratchDisassembly(); void calcAddrSgpr(GPUDynInstPtr gpuDynInst, ConstVecOperandU32 &vaddr, diff --git a/src/gpu-compute/GPUStaticInstFlags.py b/src/gpu-compute/GPUStaticInstFlags.py index b75e2c6c92..3a44d402be 100644 --- a/src/gpu-compute/GPUStaticInstFlags.py +++ b/src/gpu-compute/GPUStaticInstFlags.py @@ -54,6 +54,7 @@ class GPUStaticInstFlags(Enum): "MemoryRef", # References memory (load, store, or atomic) "Flat", # Flat memory op "FlatGlobal", # Global memory op + "FlatScratch", # Scratch memory op "Load", # Reads from memory "Store", # Writes to memory # Atomic ops diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc index ea903455d5..8d6deeb85a 100644 --- a/src/gpu-compute/compute_unit.cc +++ b/src/gpu-compute/compute_unit.cc @@ -1917,6 +1917,8 @@ ComputeUnit::updateInstStats(GPUDynInstPtr gpuDynInst) } } else if (gpuDynInst->isFlatGlobal()) { stats.flatVMemInsts++; + } else if (gpuDynInst->isFlatScratch()) { + stats.flatVMemInsts++; } else if (gpuDynInst->isLocalMem()) { stats.ldsNoFlatInsts++; } else if (gpuDynInst->isLoad()) { diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc index 3cbb6f1ff8..0b394e7e36 100644 --- a/src/gpu-compute/gpu_dyn_inst.cc +++ b/src/gpu-compute/gpu_dyn_inst.cc @@ -432,6 +432,12 @@ GPUDynInst::isFlatGlobal() const return _staticInst->isFlatGlobal(); } +bool +GPUDynInst::isFlatScratch() const +{ + return _staticInst->isFlatScratch(); +} + bool GPUDynInst::isLoad() const { @@ -901,12 +907,12 @@ GPUDynInst::resolveFlatSegment(const VectorMask &mask) uint32_t numSgprs = wavefront()->maxSgprs; uint32_t physSgprIdx = wavefront()->computeUnit->registerManager->mapSgpr(wavefront(), - numSgprs - 3); + numSgprs - 4); uint32_t offset = wavefront()->computeUnit->srf[simdId]->read(physSgprIdx); physSgprIdx = wavefront()->computeUnit->registerManager->mapSgpr(wavefront(), - numSgprs - 4); + numSgprs - 3); uint32_t size = wavefront()->computeUnit->srf[simdId]->read(physSgprIdx); for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) { @@ -919,12 +925,12 @@ GPUDynInst::resolveFlatSegment(const VectorMask &mask) wavefront()->execUnitId = wavefront()->flatLmUnitId; wavefront()->decLGKMInstsIssued(); if (isLoad()) { - wavefront()->rdGmReqsInPipe--; + wavefront()->rdLmReqsInPipe--; } else if (isStore()) { - wavefront()->wrGmReqsInPipe--; + wavefront()->wrLmReqsInPipe--; } else if (isAtomic() || isMemSync()) { - wavefront()->rdGmReqsInPipe--; - wavefront()->wrGmReqsInPipe--; + wavefront()->wrLmReqsInPipe--; + wavefront()->rdLmReqsInPipe--; } else { panic("Invalid memory operation!\n"); } diff --git a/src/gpu-compute/gpu_dyn_inst.hh b/src/gpu-compute/gpu_dyn_inst.hh index e2884a012a..558cce8431 100644 --- a/src/gpu-compute/gpu_dyn_inst.hh +++ b/src/gpu-compute/gpu_dyn_inst.hh @@ -234,6 +234,7 @@ class GPUDynInst : public GPUExecContext bool isMemRef() const; bool isFlat() const; bool isFlatGlobal() const; + bool isFlatScratch() const; bool isLoad() const; bool isStore() const; diff --git a/src/gpu-compute/gpu_static_inst.hh b/src/gpu-compute/gpu_static_inst.hh index b86a507dce..156f0e529d 100644 --- a/src/gpu-compute/gpu_static_inst.hh +++ b/src/gpu-compute/gpu_static_inst.hh @@ -130,6 +130,7 @@ class GPUStaticInst : public GPUStaticInstFlags bool isMemRef() const { return _flags[MemoryRef]; } bool isFlat() const { return _flags[Flat]; } bool isFlatGlobal() const { return _flags[FlatGlobal]; } + bool isFlatScratch() const { return _flags[FlatScratch]; } bool isLoad() const { return _flags[Load]; } bool isStore() const { return _flags[Store]; } diff --git a/src/gpu-compute/scoreboard_check_stage.cc b/src/gpu-compute/scoreboard_check_stage.cc index 3d18260822..b618cab278 100644 --- a/src/gpu-compute/scoreboard_check_stage.cc +++ b/src/gpu-compute/scoreboard_check_stage.cc @@ -154,7 +154,8 @@ ScoreboardCheckStage::ready(Wavefront *w, nonrdytype_e *rdyStatus, if (!(ii->isBarrier() || ii->isNop() || ii->isReturn() || ii->isBranch() || ii->isALU() || ii->isLoad() || ii->isStore() || ii->isAtomic() || ii->isEndOfKernel() || ii->isMemSync() || ii->isFlat() || - ii->isFlatGlobal() || ii->isSleep() || ii->isLocalMem())) { + ii->isFlatGlobal() || ii->isFlatScratch() || ii->isSleep() || + ii->isLocalMem())) { panic("next instruction: %s is of unknown type\n", ii->disassemble()); }