diff --git a/src/arch/amdgpu/vega/gpu_decoder.cc b/src/arch/amdgpu/vega/gpu_decoder.cc index 6f34301f48..eb5a5bb309 100644 --- a/src/arch/amdgpu/vega/gpu_decoder.cc +++ b/src/arch/amdgpu/vega/gpu_decoder.cc @@ -9922,29 +9922,25 @@ namespace VegaISA GPUStaticInst* Decoder::decode_OP_SCRATCH__SCRATCH_LOAD_DWORD(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_FLAT__FLAT_LOAD_DWORD(&iFmt->iFmt_FLAT); } GPUStaticInst* Decoder::decode_OP_SCRATCH__SCRATCH_LOAD_DWORDX2(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_FLAT__FLAT_LOAD_DWORDX2(&iFmt->iFmt_FLAT); } GPUStaticInst* Decoder::decode_OP_SCRATCH__SCRATCH_LOAD_DWORDX3(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_FLAT__FLAT_LOAD_DWORDX3(&iFmt->iFmt_FLAT); } GPUStaticInst* Decoder::decode_OP_SCRATCH__SCRATCH_LOAD_DWORDX4(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_FLAT__FLAT_LOAD_DWORDX4(&iFmt->iFmt_FLAT); } GPUStaticInst* @@ -9977,29 +9973,25 @@ namespace VegaISA GPUStaticInst* Decoder::decode_OP_SCRATCH__SCRATCH_STORE_DWORD(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_FLAT__FLAT_STORE_DWORD(&iFmt->iFmt_FLAT); } GPUStaticInst* Decoder::decode_OP_SCRATCH__SCRATCH_STORE_DWORDX2(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_FLAT__FLAT_STORE_DWORDX2(&iFmt->iFmt_FLAT); } GPUStaticInst* Decoder::decode_OP_SCRATCH__SCRATCH_STORE_DWORDX3(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_FLAT__FLAT_STORE_DWORDX3(&iFmt->iFmt_FLAT); } GPUStaticInst* Decoder::decode_OP_SCRATCH__SCRATCH_STORE_DWORDX4(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - 
return nullptr; + return new Inst_FLAT__FLAT_STORE_DWORDX4(&iFmt->iFmt_FLAT); } GPUStaticInst* diff --git a/src/arch/amdgpu/vega/insts/op_encodings.hh b/src/arch/amdgpu/vega/insts/op_encodings.hh index 9ab7b84974..5861f296ff 100644 --- a/src/arch/amdgpu/vega/insts/op_encodings.hh +++ b/src/arch/amdgpu/vega/insts/op_encodings.hh @@ -1258,13 +1258,12 @@ namespace VegaISA // If saddr = 0x7f there is no scalar reg to read and address will // be a 64-bit address. Otherwise, saddr is the reg index for a // scalar reg used as the base address for a 32-bit address. - if ((saddr == 0x7f && (isFlatGlobal() || isFlatScratch())) - || isFlat()) { + if ((saddr == 0x7f && isFlatGlobal()) || isFlat()) { ConstVecOperandU64 vbase(gpuDynInst, vaddr); vbase.read(); calcAddrVgpr(gpuDynInst, vbase, offset); - } else { + } else if (isFlatGlobal()) { // Assume we are operating in 64-bit mode and read a pair of // SGPRs for the address base. ConstScalarOperandU64 sbase(gpuDynInst, saddr); @@ -1274,6 +1273,57 @@ namespace VegaISA voffset.read(); calcAddrSgpr(gpuDynInst, voffset, sbase, offset); + // For scratch, if saddr = 0x7f there is no scalar reg to read and + // a vgpr supplies the address offset. Otherwise, saddr is the + // sgpr index holding the address offset. For scratch + // instructions the offset GPR is always 32 bits wide. 
+ } else if (saddr != 0x7f) { + assert(isFlatScratch()); + + ConstScalarOperandU32 soffset(gpuDynInst, saddr); + soffset.read(); + + Addr flat_scratch_addr = readFlatScratch(gpuDynInst); + + int elemSize; + auto staticInst = gpuDynInst->staticInstruction(); + if (gpuDynInst->isLoad()) { + elemSize = staticInst->getOperandSize(2); + } else { + assert(gpuDynInst->isStore()); + elemSize = staticInst->getOperandSize(1); + } + + unsigned swizzleOffset = soffset.rawData() + offset; + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + gpuDynInst->addr.at(lane) = flat_scratch_addr + + swizzle(swizzleOffset, lane, elemSize); + } + } + } else { + assert(isFlatScratch()); + + ConstVecOperandU32 voffset(gpuDynInst, vaddr); + voffset.read(); + + Addr flat_scratch_addr = readFlatScratch(gpuDynInst); + + int elemSize; + auto staticInst = gpuDynInst->staticInstruction(); + if (gpuDynInst->isLoad()) { + elemSize = staticInst->getOperandSize(2); + } else { + assert(gpuDynInst->isStore()); + elemSize = staticInst->getOperandSize(1); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + gpuDynInst->addr.at(lane) = flat_scratch_addr + + swizzle(voffset[lane] + offset, lane, elemSize); + } + } } if (isFlat()) { @@ -1285,6 +1335,7 @@ namespace VegaISA assert(isFlatScratch()); gpuDynInst->staticInstruction()->executed_as = enums::SC_PRIVATE; + gpuDynInst->resolveFlatSegment(gpuDynInst->exec_mask); } } @@ -1421,6 +1472,23 @@ namespace VegaISA } } } + + VecElemU32 + swizzle(VecElemU32 offset, int lane, int elem_size) + { + // This is not described in the spec. We use the swizzle from + // buffer memory instructions and fix the stride to 4. Multiply + // the thread ID by the storage size to avoid threads clobbering + // their data. 
+ return ((offset / 4) * 4 * 64) + + (offset % 4) + (lane * elem_size); + } + + Addr + readFlatScratch(GPUDynInstPtr gpuDynInst) + { + return gpuDynInst->computeUnit()->shader->getScratchBase(); + } }; // Inst_FLAT } // namespace VegaISA } // namespace gem5 diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc index 66b2b8ec49..80f18d2fa2 100644 --- a/src/gpu-compute/gpu_dyn_inst.cc +++ b/src/gpu-compute/gpu_dyn_inst.cc @@ -910,35 +910,63 @@ GPUDynInst::resolveFlatSegment(const VectorMask &mask) * #flat-addressing */ - uint32_t numSgprs = wavefront()->maxSgprs; - uint32_t physSgprIdx = - wavefront()->computeUnit->registerManager->mapSgpr(wavefront(), - numSgprs - 4); - uint32_t offset = - wavefront()->computeUnit->srf[simdId]->read(physSgprIdx); - physSgprIdx = - wavefront()->computeUnit->registerManager->mapSgpr(wavefront(), - numSgprs - 3); - uint32_t size = - wavefront()->computeUnit->srf[simdId]->read(physSgprIdx); - for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) { - if (mask[lane]) { - addr[lane] = addr[lane] + lane * size + offset + - wavefront()->computeUnit->shader->getHiddenPrivateBase() - - wavefront()->computeUnit->shader->getScratchBase(); + ComputeUnit *cu = wavefront()->computeUnit; + + if (wavefront()->gfxVersion == GfxVersion::gfx942) { + // Architected flat scratch base address in FLAT_SCRATCH registers + uint32_t fs_lo = cu->srf[simdId]->read( + VegaISA::REG_FLAT_SCRATCH_LO); + uint32_t fs_hi = cu->srf[simdId]->read( + VegaISA::REG_FLAT_SCRATCH_HI); + + Addr arch_flat_scratch = ((Addr)(fs_hi) << 32) | fs_lo; + + for (int lane = 0; lane < cu->wfSize(); ++lane) { + if (mask[lane]) { + // The scratch base is added for other gfx versions, + // otherwise this would simply add the register base. + addr[lane] = addr[lane] - cu->shader->getScratchBase() + + arch_flat_scratch; + } + } + } else { + // In absolute flat scratch the program needs to place the scratch + // offset in SGPR[n-4] and size in SGPR[n-3], where n = maxSgprs. 
+ uint32_t numSgprs = wavefront()->maxSgprs; + uint32_t physSgprIdx = + cu->registerManager->mapSgpr(wavefront(), numSgprs - 4); + uint32_t offset = cu->srf[simdId]->read(physSgprIdx); + physSgprIdx = + cu->registerManager->mapSgpr(wavefront(), numSgprs - 3); + uint32_t size = cu->srf[simdId]->read(physSgprIdx); + + + for (int lane = 0; lane < cu->wfSize(); ++lane) { + if (mask[lane]) { + addr[lane] = addr[lane] + lane * size + offset + + cu->shader->getHiddenPrivateBase() - + cu->shader->getScratchBase(); + } } } - wavefront()->execUnitId = wavefront()->flatLmUnitId; - wavefront()->decLGKMInstsIssued(); - if (isLoad()) { - wavefront()->rdLmReqsInPipe--; - } else if (isStore()) { - wavefront()->wrLmReqsInPipe--; - } else if (isAtomic() || isMemSync()) { - wavefront()->wrLmReqsInPipe--; - wavefront()->rdLmReqsInPipe--; - } else { - panic("Invalid memory operation!\n"); + + wavefront()->execUnitId = wavefront()->flatLmUnitId; + + // For FLAT the local memory pipe counters are incremented, but they + // are not incremented for explicit scratch_* instructions. Only + // decrement these counters if we are explicitly a FLAT instruction. 
+ if (isFlat()) { + wavefront()->decLGKMInstsIssued(); + if (isLoad()) { + wavefront()->rdLmReqsInPipe--; + } else if (isStore()) { + wavefront()->wrLmReqsInPipe--; + } else if (isAtomic() || isMemSync()) { + wavefront()->wrLmReqsInPipe--; + wavefront()->rdLmReqsInPipe--; + } else { + panic("Invalid memory operation!\n"); + } } } else { for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) { diff --git a/src/gpu-compute/gpu_static_inst.hh b/src/gpu-compute/gpu_static_inst.hh index 6132ab2d29..1ec06dc7d3 100644 --- a/src/gpu-compute/gpu_static_inst.hh +++ b/src/gpu-compute/gpu_static_inst.hh @@ -179,7 +179,8 @@ class GPUStaticInst : public GPUStaticInstFlags { return _flags[MemoryRef] && (_flags[GlobalSegment] || _flags[PrivateSegment] || _flags[ReadOnlySegment] || - _flags[SpillSegment] || _flags[FlatGlobal]); + _flags[SpillSegment] || _flags[FlatGlobal] || + _flags[FlatScratch]); } bool diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc index b5298bad4c..de7c2333c2 100644 --- a/src/gpu-compute/wavefront.cc +++ b/src/gpu-compute/wavefront.cc @@ -118,6 +118,7 @@ void Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems) { int regInitIdx = 0; + gfxVersion = task->gfxVersion(); // Iterate over all the init fields and check which // bits are enabled. 
Useful information can be found here: @@ -378,8 +379,29 @@ Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems) wfSlotId, wfDynId, physSgprIdx, workGroupId[2]); break; case PrivSegWaveByteOffset: + + // For architected flat scratch, this enable is reused to set + // the FLAT_SCRATCH register pair to the scratch backing + // memory: https://llvm.org/docs/AMDGPUUsage.html#flat-scratch + if (task->gfxVersion() == GfxVersion::gfx942) { + Addr arch_flat_scratch = + task->amdQueue.scratch_backing_memory_location; + computeUnit->srf[simdId]->write( + VegaISA::REG_FLAT_SCRATCH_HI, + bits(arch_flat_scratch, 63, 32)); + computeUnit->srf[simdId]->write( + VegaISA::REG_FLAT_SCRATCH_LO, + bits(arch_flat_scratch, 31, 0)); + + break; + } + + // Not architected flat scratch. Write the scratch wavefront + // offset: https://llvm.org/docs/AMDGPUUsage.html + // #amdgpu-amdhsa-initial-kernel-execution-state physSgprIdx = computeUnit->registerManager->mapSgpr(this, regInitIdx); + /** * the compute_tmpring_size_wavesize specifies the number of * kB allocated per wavefront, hence the multiplication by diff --git a/src/gpu-compute/wavefront.hh b/src/gpu-compute/wavefront.hh index 82035f7d47..b7dff4617b 100644 --- a/src/gpu-compute/wavefront.hh +++ b/src/gpu-compute/wavefront.hh @@ -92,6 +92,8 @@ class Wavefront : public SimObject S_BARRIER }; + // GFX version of the task this wavefront is executing + GfxVersion gfxVersion; // HW slot id where the WF is mapped to inside a SIMD unit const int wfSlotId; int kernId;