From 1dab4be002955a83a3e38f2a3a236df888284ded Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Sat, 15 Jun 2024 13:44:36 -0700 Subject: [PATCH 1/4] arch-vega: Implement VOP3 V_FMAC_F32 A version of V_FMAC_F32 with extra modifiers from VOP3 format. Change-Id: Ib6b41b0a3ceb91269b91a0287dfc94bc73e4d217 --- src/arch/amdgpu/vega/gpu_decoder.cc | 8 ++- src/arch/amdgpu/vega/gpu_decoder.hh | 1 + src/arch/amdgpu/vega/insts/instructions.hh | 34 +++++++++++ src/arch/amdgpu/vega/insts/vop3.cc | 67 ++++++++++++++++++++++ 4 files changed, 109 insertions(+), 1 deletion(-) diff --git a/src/arch/amdgpu/vega/gpu_decoder.cc b/src/arch/amdgpu/vega/gpu_decoder.cc index e07a392ced..43c33e44cc 100644 --- a/src/arch/amdgpu/vega/gpu_decoder.cc +++ b/src/arch/amdgpu/vega/gpu_decoder.cc @@ -886,7 +886,7 @@ namespace VegaISA &Decoder::decode_invalid, &Decoder::decode_invalid, &Decoder::decode_invalid, - &Decoder::decode_invalid, + &Decoder::decode_OPU_VOP3__V_FMAC_F32, &Decoder::decode_invalid, &Decoder::decode_invalid, &Decoder::decode_invalid, @@ -6172,6 +6172,12 @@ namespace VegaISA return new Inst_VOP3__V_SUBREV_U32(&iFmt->iFmt_VOP3A); } // decode_OPU_VOP3__V_SUBREV_U32 + GPUStaticInst* + Decoder::decode_OPU_VOP3__V_FMAC_F32(MachInst iFmt) + { + return new Inst_VOP3__V_FMAC_F32(&iFmt->iFmt_VOP3A); + } // decode_OPU_VOP3__V_FMAC_F32 + GPUStaticInst* Decoder::decode_OPU_VOP3__V_NOP(MachInst iFmt) { diff --git a/src/arch/amdgpu/vega/gpu_decoder.hh b/src/arch/amdgpu/vega/gpu_decoder.hh index 2523734ce5..e3b9c20e1f 100644 --- a/src/arch/amdgpu/vega/gpu_decoder.hh +++ b/src/arch/amdgpu/vega/gpu_decoder.hh @@ -325,6 +325,7 @@ namespace VegaISA GPUStaticInst* decode_OPU_VOP3__V_ADD_U32(MachInst); GPUStaticInst* decode_OPU_VOP3__V_SUB_U32(MachInst); GPUStaticInst* decode_OPU_VOP3__V_SUBREV_U32(MachInst); + GPUStaticInst* decode_OPU_VOP3__V_FMAC_F32(MachInst); GPUStaticInst* decode_OPU_VOP3__V_NOP(MachInst); GPUStaticInst* decode_OPU_VOP3__V_MOV_B32(MachInst); GPUStaticInst* 
decode_OPU_VOP3__V_CVT_I32_F64(MachInst); diff --git a/src/arch/amdgpu/vega/insts/instructions.hh b/src/arch/amdgpu/vega/insts/instructions.hh index 21984a9bbd..a979c1e492 100644 --- a/src/arch/amdgpu/vega/insts/instructions.hh +++ b/src/arch/amdgpu/vega/insts/instructions.hh @@ -25950,6 +25950,40 @@ namespace VegaISA void execute(GPUDynInstPtr) override; }; // Inst_VOP3__V_SUBREV_U32 + class Inst_VOP3__V_FMAC_F32 : public Inst_VOP3A + { + public: + Inst_VOP3__V_FMAC_F32(InFmt_VOP3A*); + ~Inst_VOP3__V_FMAC_F32(); + // Operand count is reported as destinations plus sources. + int + getNumOperands() override + { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + // NOTE(review): execute() also reads vdst; confirm 2 sources is right. + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return 2; } + // Every valid operand index (src_0, src_1, vdst) reports size 4. + int + getOperandSize(int opIdx) override + { + switch (opIdx) { + case 0: //src_0 + return 4; + case 1: //src_1 + return 4; + case 2: //vdst + return 4; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } // getOperandSize + + void execute(GPUDynInstPtr) override; + }; // Inst_VOP3__V_FMAC_F32 + class Inst_VOP3__V_NOP : public Inst_VOP3A { public: diff --git a/src/arch/amdgpu/vega/insts/vop3.cc b/src/arch/amdgpu/vega/insts/vop3.cc index 47665ad353..b9fee17353 100644 --- a/src/arch/amdgpu/vega/insts/vop3.cc +++ b/src/arch/amdgpu/vega/insts/vop3.cc @@ -2404,6 +2404,73 @@ namespace VegaISA vdst.write(); } // execute + // --- Inst_VOP3__V_FMAC_F32 class methods --- + // Constructor: names the opcode "v_fmac_f32" and sets ALU, F32, FMA. + Inst_VOP3__V_FMAC_F32::Inst_VOP3__V_FMAC_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_fmac_f32", false) + { + setFlag(ALU); + setFlag(F32); + setFlag(FMA); + } // Inst_VOP3__V_FMAC_F32 + + Inst_VOP3__V_FMAC_F32::~Inst_VOP3__V_FMAC_F32() + { + } // ~Inst_VOP3__V_FMAC_F32 + + // --- description from .arch file --- + // D.f = S0.f * S1.f + D.f.
+ void + Inst_VOP3__V_FMAC_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + vdst.read(); + + panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode); + panic_if(isDPPInst(), "DPP not implemented for %s", _opcode); + panic_if(instData.OPSEL, "OPSEL not implemented for %s", _opcode); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (instData.ABS & 0x4) { + vdst.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + if (extData.NEG & 0x4) { + vdst.negModifier(); + } + // D = fma(S0, S1, D), then OMOD; clamp to [0, 1] only if CLAMP set. + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + float out = std::fma(src0[lane], src1[lane], vdst[lane]); + out = omodModifier(out, extData.OMOD); + out = instData.CLAMP ? std::clamp(out, 0.0f, 1.0f) : out; + vdst[lane] = out; + } + } + + vdst.write(); + } // execute // --- Inst_VOP3__V_NOP class methods --- Inst_VOP3__V_NOP::Inst_VOP3__V_NOP(InFmt_VOP3A *iFmt) From 42369eab2cdd1bdb62b22cdcbdf3ab17b23735b0 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Sat, 15 Jun 2024 13:48:57 -0700 Subject: [PATCH 2/4] arch-vega: Implement MI300 FLAT SVE bit For scratch instructions only, this bit specifies if an offset in a VGPR should be used for address calculation. This is new in MI300 and was previously the LDS bit. The LDS bit is rarely used and in fact gem5 does not even check this bit. This fixes a bug when SADDR == 0x7f (i.e., no SGPR should be used) where a VGPR was being added to the address when it should have been ignored.
Change-Id: I9864379692df6795b25b58b98825da05d18fc5db --- src/arch/amdgpu/vega/gpu_decoder.hh | 2 +- src/arch/amdgpu/vega/insts/op_encodings.hh | 18 +++++++++++++++--- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/src/arch/amdgpu/vega/gpu_decoder.hh b/src/arch/amdgpu/vega/gpu_decoder.hh index e3b9c20e1f..285377ad3d 100644 --- a/src/arch/amdgpu/vega/gpu_decoder.hh +++ b/src/arch/amdgpu/vega/gpu_decoder.hh @@ -1714,7 +1714,7 @@ namespace VegaISA struct InFmt_FLAT { unsigned int OFFSET : 13; - unsigned int LDS : 1; + unsigned int SVE : 1; unsigned int SEG : 2; unsigned int GLC : 1; unsigned int SLC : 1; diff --git a/src/arch/amdgpu/vega/insts/op_encodings.hh b/src/arch/amdgpu/vega/insts/op_encodings.hh index 3c5804526a..504946534f 100644 --- a/src/arch/amdgpu/vega/insts/op_encodings.hh +++ b/src/arch/amdgpu/vega/insts/op_encodings.hh @@ -1306,6 +1306,11 @@ namespace VegaISA ConstScalarOperandU32 soffset(gpuDynInst, saddr); soffset.read(); + ConstVecOperandU32 voffset(gpuDynInst, vaddr); + if (instData.SVE) { + voffset.read(); + } + Addr flat_scratch_addr = readFlatScratch(gpuDynInst); int elemSize; @@ -1320,6 +1325,8 @@ namespace VegaISA unsigned swizzleOffset = soffset.rawData() + offset; for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { if (gpuDynInst->exec_mask[lane]) { + // Lane-local VGPR offset; must not carry over to later lanes. + VecElemU32 voff = instData.SVE ? voffset[lane] : 0; gpuDynInst->addr.at(lane) = flat_scratch_addr - + swizzle(swizzleOffset, lane, elemSize); + + swizzle(swizzleOffset + voff, lane, elemSize); } @@ -1328,7 +1335,9 @@ namespace VegaISA assert(isFlatScratch()); ConstVecOperandU32 voffset(gpuDynInst, vaddr); - voffset.read(); + if (instData.SVE) { + voffset.read(); + } Addr flat_scratch_addr = readFlatScratch(gpuDynInst); @@ -1343,8 +1352,11 @@ namespace VegaISA for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { if (gpuDynInst->exec_mask[lane]) { + VecElemU32 vgpr_offset = + instData.SVE ?
voffset[lane] : 0; + gpuDynInst->addr.at(lane) = flat_scratch_addr - + swizzle(voffset[lane] + offset, lane, elemSize); + + swizzle(vgpr_offset + offset, lane, elemSize); } } } From 2f5842d253c96bc9cac30a74815ff4a4fa7d27c9 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Sat, 15 Jun 2024 14:17:15 -0700 Subject: [PATCH 3/4] arch-vega: Add valid flag to ds_swizzle_b32 Currently the flag is just Load and there is a long comment explaining why. This does not meet any of the scoreboard check requirements: https://github.com/gem5/gem5/blob/develop/src/gpu-compute/scoreboard_check_stage.cc#L230-L241 Add a generic ALU flag as well so the instruction executes instead of panicking. Change-Id: I54b2d20d47fad5e8f05f927328433aab7db7d862 --- src/arch/amdgpu/vega/insts/ds.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/arch/amdgpu/vega/insts/ds.cc b/src/arch/amdgpu/vega/insts/ds.cc index 57d58638c5..c377daa487 100644 --- a/src/arch/amdgpu/vega/insts/ds.cc +++ b/src/arch/amdgpu/vega/insts/ds.cc @@ -1997,6 +1997,7 @@ namespace VegaISA * fits in better with the LDS pipeline logic. */ setFlag(Load); + setFlag(ALU); } // Inst_DS__DS_SWIZZLE_B32 Inst_DS__DS_SWIZZLE_B32::~Inst_DS__DS_SWIZZLE_B32() From 2b0ca93517f0bb3f50475bf92997a6aa6c354dc7 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Sat, 15 Jun 2024 15:46:33 -0700 Subject: [PATCH 4/4] gpu-compute: Fix architected flat scratch Currently writing to SRF which is incorrect, as the physical register number can be clobbered by another wavefront if registers get renamed to the physical register number. Fix this by actually architecting the register, i.e., there is a dedicated "hardware" register in the wavefront class. 
Change-Id: I94e9e463eed348b2928cae884c1c20566c00984d --- src/gpu-compute/gpu_dyn_inst.cc | 12 +++--------- src/gpu-compute/wavefront.cc | 13 ++++++------- src/gpu-compute/wavefront.hh | 3 +++ 3 files changed, 12 insertions(+), 16 deletions(-) diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc index d4a6a8f447..c4a8e9085a 100644 --- a/src/gpu-compute/gpu_dyn_inst.cc +++ b/src/gpu-compute/gpu_dyn_inst.cc @@ -925,20 +925,14 @@ GPUDynInst::resolveFlatSegment(const VectorMask &mask) ComputeUnit *cu = wavefront()->computeUnit; if (wavefront()->gfxVersion == GfxVersion::gfx942) { - // Architected flat scratch base address in FLAT_SCRATCH registers - uint32_t fs_lo = cu->srf[simdId]->read( - VegaISA::REG_FLAT_SCRATCH_LO); - uint32_t fs_hi = cu->srf[simdId]->read( - VegaISA::REG_FLAT_SCRATCH_HI); - - Addr arch_flat_scratch = ((Addr)(fs_hi) << 32) | fs_lo; - + // Architected flat scratch base address is in a dedicated hardware + // register. for (int lane = 0; lane < cu->wfSize(); ++lane) { if (mask[lane]) { // The scratch base is added for other gfx versions, // otherwise this would simply add the register base. 
addr[lane] = addr[lane] - cu->shader->getScratchBase() - + arch_flat_scratch; + + wavefront()->archFlatScratchAddr; } } } else { diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc index 1b94b13b6e..d14f8aee3c 100644 --- a/src/gpu-compute/wavefront.cc +++ b/src/gpu-compute/wavefront.cc @@ -384,14 +384,13 @@ Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems) // the FLAT_SCRATCH register pair to the scratch backing // memory: https://llvm.org/docs/AMDGPUUsage.html#flat-scratch if (task->gfxVersion() == GfxVersion::gfx942) { - Addr arch_flat_scratch = + archFlatScratchAddr = task->amdQueue.scratch_backing_memory_location; - computeUnit->srf[simdId]->write( - VegaISA::REG_FLAT_SCRATCH_HI, - bits(arch_flat_scratch, 63, 32)); - computeUnit->srf[simdId]->write( - VegaISA::REG_FLAT_SCRATCH_LO, - bits(arch_flat_scratch, 31, 0)); + + DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " + "Setting architected flat scratch = %x\n", + computeUnit->cu_id, simdId, wfSlotId, wfDynId, + archFlatScratchAddr); break; } diff --git a/src/gpu-compute/wavefront.hh b/src/gpu-compute/wavefront.hh index b7dff4617b..476393603b 100644 --- a/src/gpu-compute/wavefront.hh +++ b/src/gpu-compute/wavefront.hh @@ -205,6 +205,9 @@ class Wavefront : public SimObject // will live while the WF is executed uint32_t startSgprIndex; + // Architected flat scratch address for MI300+ + Addr archFlatScratchAddr = 0; + // Old value of destination gpr (for trace) std::vector oldVgpr; // Id of destination gpr (for trace)