From 7cdb69bf21b84ac5182ef47bf11990393427fc0a Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Fri, 31 May 2024 09:27:10 -0700 Subject: [PATCH 1/3] arch-vega: Fill in scratch insts to match flat/global Flat, scratch, and global share the same instruction implementation with different address calculations essentially. These instructions were already implemented but not added to the decoder. This commit adds the remaining scratch instructions which have a shared instruction implementation. Change-Id: I8f2e9ceb221294dce1b81c45745b642f0592d985 --- src/arch/amdgpu/vega/gpu_decoder.cc | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/src/arch/amdgpu/vega/gpu_decoder.cc b/src/arch/amdgpu/vega/gpu_decoder.cc index 0f4b1e9872..8c12013ae8 100644 --- a/src/arch/amdgpu/vega/gpu_decoder.cc +++ b/src/arch/amdgpu/vega/gpu_decoder.cc @@ -8622,7 +8622,6 @@ namespace VegaISA Decoder::decode_OP_GLOBAL__GLOBAL_STORE_DWORD(MachInst iFmt) { return new Inst_FLAT__FLAT_STORE_DWORD(&iFmt->iFmt_FLAT); - return nullptr; } GPUStaticInst* @@ -9898,29 +9897,25 @@ namespace VegaISA GPUStaticInst* Decoder::decode_OP_SCRATCH__SCRATCH_LOAD_UBYTE(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_FLAT__FLAT_LOAD_UBYTE(&iFmt->iFmt_FLAT); } GPUStaticInst* Decoder::decode_OP_SCRATCH__SCRATCH_LOAD_SBYTE(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_FLAT__FLAT_LOAD_SBYTE(&iFmt->iFmt_FLAT); } GPUStaticInst* Decoder::decode_OP_SCRATCH__SCRATCH_LOAD_USHORT(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_FLAT__FLAT_LOAD_USHORT(&iFmt->iFmt_FLAT); } GPUStaticInst* Decoder::decode_OP_SCRATCH__SCRATCH_LOAD_SSHORT(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_FLAT__FLAT_LOAD_SSHORT(&iFmt->iFmt_FLAT); } GPUStaticInst* @@ 
-9950,8 +9945,7 @@ namespace VegaISA GPUStaticInst* Decoder::decode_OP_SCRATCH__SCRATCH_STORE_BYTE(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_FLAT__FLAT_STORE_BYTE(&iFmt->iFmt_FLAT); } GPUStaticInst* @@ -9964,8 +9958,7 @@ namespace VegaISA GPUStaticInst* Decoder::decode_OP_SCRATCH__SCRATCH_STORE_SHORT(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_FLAT__FLAT_STORE_SHORT(&iFmt->iFmt_FLAT); } GPUStaticInst* From 6c8caf83c6f0c0e49f6eb601b204e60f1d334149 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Wed, 15 May 2024 12:05:54 -0700 Subject: [PATCH 2/3] arch-vega: Implement V_ACCVGPR_MOV_B32 instruction This instruction is a simple move from accumulation register to accumulation register. It is essentially a move with the accumulation offset added to the register index. Change-Id: Ic93ae72599b75c91213f56ebafe5bbd7b2867089 --- src/arch/amdgpu/vega/gpu_decoder.cc | 8 +++++- src/arch/amdgpu/vega/gpu_decoder.hh | 1 + src/arch/amdgpu/vega/insts/instructions.hh | 32 ++++++++++++++++++++++ src/arch/amdgpu/vega/insts/vop1.cc | 32 ++++++++++++++++++++++ 4 files changed, 72 insertions(+), 1 deletion(-) diff --git a/src/arch/amdgpu/vega/gpu_decoder.cc b/src/arch/amdgpu/vega/gpu_decoder.cc index 8c12013ae8..e07a392ced 100644 --- a/src/arch/amdgpu/vega/gpu_decoder.cc +++ b/src/arch/amdgpu/vega/gpu_decoder.cc @@ -3144,7 +3144,7 @@ namespace VegaISA &Decoder::decode_OP_VOP1__V_SAT_PK_U8_I16, &Decoder::decode_invalid, &Decoder::decode_OP_VOP1__V_SWAP_B32, - &Decoder::decode_invalid, + &Decoder::decode_OP_VOP1__V_ACCVGPR_MOV_B32, &Decoder::decode_invalid, &Decoder::decode_invalid, &Decoder::decode_invalid, @@ -11777,6 +11777,12 @@ namespace VegaISA return nullptr; } + GPUStaticInst* + Decoder::decode_OP_VOP1__V_ACCVGPR_MOV_B32(MachInst iFmt) + { + return new Inst_VOP1__V_ACCVGPR_MOV_B32(&iFmt->iFmt_VOP1); + } + GPUStaticInst* 
Decoder::decode_OP_VOPC__V_CMP_CLASS_F32(MachInst iFmt) { diff --git a/src/arch/amdgpu/vega/gpu_decoder.hh b/src/arch/amdgpu/vega/gpu_decoder.hh index 8094233bd8..2523734ce5 100644 --- a/src/arch/amdgpu/vega/gpu_decoder.hh +++ b/src/arch/amdgpu/vega/gpu_decoder.hh @@ -1314,6 +1314,7 @@ namespace VegaISA GPUStaticInst* decode_OP_VOP1__V_CVT_NORM_U16_F16(MachInst); GPUStaticInst* decode_OP_VOP1__V_SAT_PK_U8_I16(MachInst); GPUStaticInst* decode_OP_VOP1__V_SWAP_B32(MachInst); + GPUStaticInst* decode_OP_VOP1__V_ACCVGPR_MOV_B32(MachInst); GPUStaticInst* decode_OP_VOP2__V_CNDMASK_B32(MachInst); GPUStaticInst* decode_OP_VOP2__V_ADD_F32(MachInst); GPUStaticInst* decode_OP_VOP2__V_SUB_F32(MachInst); diff --git a/src/arch/amdgpu/vega/insts/instructions.hh b/src/arch/amdgpu/vega/insts/instructions.hh index 9d91526f3f..4e71f13ad4 100644 --- a/src/arch/amdgpu/vega/insts/instructions.hh +++ b/src/arch/amdgpu/vega/insts/instructions.hh @@ -10562,6 +10562,38 @@ namespace VegaISA void execute(GPUDynInstPtr) override; }; // Inst_VOP1__V_LOG_LEGACY_F32 + class Inst_VOP1__V_ACCVGPR_MOV_B32 : public Inst_VOP1 + { + public: + Inst_VOP1__V_ACCVGPR_MOV_B32(InFmt_VOP1*); + ~Inst_VOP1__V_ACCVGPR_MOV_B32(); + + int + getNumOperands() override + { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return 1; } + + int + getOperandSize(int opIdx) override + { + switch (opIdx) { + case 0: //src + return 4; + case 1: //vdst + return 4; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } // getOperandSize + + void execute(GPUDynInstPtr) override; + }; // Inst_VOP1__V_ACCVGPR_MOV_B32 + class Inst_VOPC__V_CMP_CLASS_F32 : public Inst_VOPC { public: diff --git a/src/arch/amdgpu/vega/insts/vop1.cc b/src/arch/amdgpu/vega/insts/vop1.cc index 3bbf1e0085..f970923951 100644 --- a/src/arch/amdgpu/vega/insts/vop1.cc +++ b/src/arch/amdgpu/vega/insts/vop1.cc @@ -2397,6 +2397,38 
@@ namespace VegaISA } } + vdst.write(); + } // execute + // --- Inst_VOP1__V_ACCVGPR_MOV_B32 class methods --- + + Inst_VOP1__V_ACCVGPR_MOV_B32:: + Inst_VOP1__V_ACCVGPR_MOV_B32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_accvgpr_mov_b32") + { + setFlag(ALU); + } // Inst_VOP1__V_ACCVGPR_MOV_B32 + + Inst_VOP1__V_ACCVGPR_MOV_B32::~Inst_VOP1__V_ACCVGPR_MOV_B32() + { + } // ~Inst_VOP1__V_ACCVGPR_MOV_B32 + + void + Inst_VOP1__V_ACCVGPR_MOV_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + unsigned accum_offset = wf->accumOffset; + + ConstVecOperandU32 src(gpuDynInst, instData.SRC0+accum_offset); + VecOperandU32 vdst(gpuDynInst, instData.VDST+accum_offset); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src[lane]; + } + } + vdst.write(); } // execute } // namespace VegaISA From 00dcd5b0bc9a817f7bf64bbac48f97c3a7a19c00 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Fri, 31 May 2024 13:40:20 -0700 Subject: [PATCH 3/3] arch-vega: Implement literals for 64b dest operands This feature has been available since Vega10 but was never implemented. MI300 adds a few new instructions that make use of this more often (e.g., v_mov_b64). 
Change-Id: Ieeb7834462b76d77c0030f49622d0de09f90c9e4 --- src/arch/amdgpu/vega/operand.hh | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/src/arch/amdgpu/vega/operand.hh b/src/arch/amdgpu/vega/operand.hh index 593f0e34fd..1bb9b43d1f 100644 --- a/src/arch/amdgpu/vega/operand.hh +++ b/src/arch/amdgpu/vega/operand.hh @@ -579,8 +579,30 @@ namespace VegaISA case REG_SRC_SWDA: case REG_SRC_DPP: case REG_SRC_LITERAL: - assert(NumDwords == 1); + /** + * From the Vega specification: + * When a literal constant is used with a 64 bit instruction, + * the literal is expanded to 64 bits by: padding the LSBs + * with zeros for floats, padding the MSBs with zeros for + * unsigned ints, and by sign-extending signed ints. + */ srfData[0] = _gpuDynInst->srcLiteral(); + if constexpr (NumDwords == 2) { + if constexpr (std::is_integral_v<DataType>) { + if constexpr (std::is_signed_v<DataType>) { + if (bits(srfData[0], 31, 31) == 1) { + srfData[1] = 0xffffffff; + } else { + srfData[1] = 0; + } + } else { + srfData[1] = 0; + } + } else { + srfData[1] = _gpuDynInst->srcLiteral(); + srfData[0] = 0; + } + } break; case REG_SHARED_BASE: {