From 9ab004cccca501b56cd39be1cefc677fd12b7a4c Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Tue, 13 Feb 2024 16:34:05 -0600 Subject: [PATCH] arch-vega: Implement V_LSHL_ADD_U64 This is a new instruction in MI300 and operates similarly to V_LSHL_ADD_U32 but on 64-bit values. Change-Id: Ia4ac65160bdad748fccdcb28286ba03157cc4046 --- src/arch/amdgpu/vega/gpu_decoder.cc | 8 +++- src/arch/amdgpu/vega/gpu_decoder.hh | 1 + src/arch/amdgpu/vega/insts/instructions.hh | 36 ++++++++++++++++ src/arch/amdgpu/vega/insts/vop3.cc | 48 ++++++++++++++++++++++ 4 files changed, 92 insertions(+), 1 deletion(-) diff --git a/src/arch/amdgpu/vega/gpu_decoder.cc b/src/arch/amdgpu/vega/gpu_decoder.cc index 2220d820b1..406ada6c52 100644 --- a/src/arch/amdgpu/vega/gpu_decoder.cc +++ b/src/arch/amdgpu/vega/gpu_decoder.cc @@ -1091,7 +1091,7 @@ namespace VegaISA &Decoder::decode_OPU_VOP3__V_MAD_I16, &Decoder::decode_OPU_VOP3__V_FMA_F16, &Decoder::decode_OPU_VOP3__V_DIV_FIXUP_F16, - &Decoder::decode_invalid, + &Decoder::decode_OPU_VOP3__V_LSHL_ADD_U64, &Decoder::decode_invalid, &Decoder::decode_invalid, &Decoder::decode_invalid, @@ -7054,6 +7054,12 @@ namespace VegaISA return new Inst_VOP3__V_DIV_FIXUP_F16(&iFmt->iFmt_VOP3A); } + GPUStaticInst* + Decoder::decode_OPU_VOP3__V_LSHL_ADD_U64(MachInst iFmt) + { + return new Inst_VOP3__V_LSHL_ADD_U64(&iFmt->iFmt_VOP3A); + } + GPUStaticInst* Decoder::decode_OPU_VOP3__V_INTERP_P1_F32(MachInst iFmt) { diff --git a/src/arch/amdgpu/vega/gpu_decoder.hh b/src/arch/amdgpu/vega/gpu_decoder.hh index 48084a6913..d3b39fd945 100644 --- a/src/arch/amdgpu/vega/gpu_decoder.hh +++ b/src/arch/amdgpu/vega/gpu_decoder.hh @@ -470,6 +470,7 @@ namespace VegaISA GPUStaticInst* decode_OPU_VOP3__V_MAD_I16(MachInst); GPUStaticInst* decode_OPU_VOP3__V_FMA_F16(MachInst); GPUStaticInst* decode_OPU_VOP3__V_DIV_FIXUP_F16(MachInst); + GPUStaticInst* decode_OPU_VOP3__V_LSHL_ADD_U64(MachInst); GPUStaticInst* decode_OPU_VOP3__V_INTERP_P1_F32(MachInst); GPUStaticInst* 
decode_OPU_VOP3__V_INTERP_P2_F32(MachInst); GPUStaticInst* decode_OPU_VOP3__V_INTERP_MOV_F32(MachInst); diff --git a/src/arch/amdgpu/vega/insts/instructions.hh b/src/arch/amdgpu/vega/insts/instructions.hh index db03548a3d..4c96a3e34b 100644 --- a/src/arch/amdgpu/vega/insts/instructions.hh +++ b/src/arch/amdgpu/vega/insts/instructions.hh @@ -30158,6 +30158,42 @@ namespace VegaISA void execute(GPUDynInstPtr) override; }; // Inst_VOP3__V_DIV_FIXUP_F16 + class Inst_VOP3__V_LSHL_ADD_U64 : public Inst_VOP3A + { + public: + Inst_VOP3__V_LSHL_ADD_U64(InFmt_VOP3A*); + ~Inst_VOP3__V_LSHL_ADD_U64(); + + int + getNumOperands() override + { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return 3; } + + int + getOperandSize(int opIdx) override + { + switch (opIdx) { + case 0: //src_0 - value to shift (U64 in execute) + return 8; + case 1: //src_1 - shift amount (U32 in execute) + return 4; + case 2: //src_2 - addend (U64 in execute) + return 8; + case 3: //vdst - result (U64 in execute) + return 8; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } // getOperandSize + + void execute(GPUDynInstPtr) override; + }; // Inst_VOP3__V_LSHL_ADD_U64 + class Inst_VOP3__V_CVT_PKACCUM_U8_F32 : public Inst_VOP3A { public: diff --git a/src/arch/amdgpu/vega/insts/vop3.cc b/src/arch/amdgpu/vega/insts/vop3.cc index 8f6794c9c2..f78f64bc91 100644 --- a/src/arch/amdgpu/vega/insts/vop3.cc +++ b/src/arch/amdgpu/vega/insts/vop3.cc @@ -7630,6 +7630,54 @@ namespace VegaISA { panicUnimplemented(); } // execute + // --- Inst_VOP3__V_LSHL_ADD_U64 class methods --- + + Inst_VOP3__V_LSHL_ADD_U64::Inst_VOP3__V_LSHL_ADD_U64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_lshl_add_u64", false) + { + setFlag(ALU); + } // Inst_VOP3__V_LSHL_ADD_U64 + + Inst_VOP3__V_LSHL_ADD_U64::~Inst_VOP3__V_LSHL_ADD_U64() + { + } // ~Inst_VOP3__V_LSHL_ADD_U64 + + // --- description from .arch file --- + // D.u64 = (S0.u64 << S1.u[2:0]) + S2.u64; shifts above 4 act as 0. 
+ void + Inst_VOP3__V_LSHL_ADD_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU64 src2(gpuDynInst, extData.SRC2); + VecOperandU64 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers (ABS/NEG) are FP-only and must all be clear here + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + int shift_amount = bits(src1[lane], 2, 0); + shift_amount = shift_amount > 4 ? 0 : shift_amount; + vdst[lane] = (src0[lane] << shift_amount) + + src2[lane]; + } + } + + vdst.write(); + } // execute // --- Inst_VOP3__V_CVT_PKACCUM_U8_F32 class methods --- Inst_VOP3__V_CVT_PKACCUM_U8_F32::Inst_VOP3__V_CVT_PKACCUM_U8_F32(