From 3a73aa3ac1814d0311989cd2b7235d44a6741c07 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Thu, 14 Jul 2022 14:47:54 -0700 Subject: [PATCH] arch-vega: Implement new VOP2 using VOP3 insts Vega adds three new VOP2 instructions that are not part of the GCN3 ISA and that may also use the VOP3 encoding: v_add_u32, v_sub_u32, and v_subrev_u32. This changeset implements the VOP3-encoded forms of those three instructions to fix errors related to "invalid encoding" when those instructions are encountered. Tested using srad from the Rodinia 3.0 HIP port, which compiles to a v_add_u32 instruction with VOP3 encoding. Change-Id: I409a9f72f5c37895c3a0ab7ceb14a4dd121874a4 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/61330 Maintainer: Matt Sinclair Tested-by: kokoro Reviewed-by: Matt Sinclair --- src/arch/amdgpu/vega/decoder.cc | 24 +++- src/arch/amdgpu/vega/gpu_decoder.hh | 3 + src/arch/amdgpu/vega/insts/instructions.cc | 129 +++++++++++++++++++++ src/arch/amdgpu/vega/insts/instructions.hh | 102 ++++++++++++++++ 4 files changed, 255 insertions(+), 3 deletions(-) diff --git a/src/arch/amdgpu/vega/decoder.cc b/src/arch/amdgpu/vega/decoder.cc index f716636978..c4dfe9a657 100644 --- a/src/arch/amdgpu/vega/decoder.cc +++ b/src/arch/amdgpu/vega/decoder.cc @@ -877,9 +877,9 @@ namespace VegaISA &Decoder::decode_OPU_VOP3__V_MIN_U16, &Decoder::decode_OPU_VOP3__V_MIN_I16, &Decoder::decode_OPU_VOP3__V_LDEXP_F16, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, + &Decoder::decode_OPU_VOP3__V_ADD_U32, + &Decoder::decode_OPU_VOP3__V_SUB_U32, + &Decoder::decode_OPU_VOP3__V_SUBREV_U32, &Decoder::decode_invalid, &Decoder::decode_invalid, &Decoder::decode_invalid, @@ -6105,6 +6105,24 @@ namespace VegaISA return new Inst_VOP3__V_LDEXP_F16(&iFmt->iFmt_VOP3A); } // decode_OPU_VOP3__V_LDEXP_F16 + GPUStaticInst* + Decoder::decode_OPU_VOP3__V_ADD_U32(MachInst iFmt) + { + return new Inst_VOP3__V_ADD_U32(&iFmt->iFmt_VOP3A); + } // decode_OPU_VOP3__V_ADD_U32 + + GPUStaticInst* + 
Decoder::decode_OPU_VOP3__V_SUB_U32(MachInst iFmt) + { + return new Inst_VOP3__V_SUB_U32(&iFmt->iFmt_VOP3A); + } // decode_OPU_VOP3__V_SUB_U32 + + GPUStaticInst* + Decoder::decode_OPU_VOP3__V_SUBREV_U32(MachInst iFmt) + { + return new Inst_VOP3__V_SUBREV_U32(&iFmt->iFmt_VOP3A); + } // decode_OPU_VOP3__V_SUBREV_U32 + GPUStaticInst* Decoder::decode_OPU_VOP3__V_NOP(MachInst iFmt) { diff --git a/src/arch/amdgpu/vega/gpu_decoder.hh b/src/arch/amdgpu/vega/gpu_decoder.hh index 1ca292675e..1be43861df 100644 --- a/src/arch/amdgpu/vega/gpu_decoder.hh +++ b/src/arch/amdgpu/vega/gpu_decoder.hh @@ -322,6 +322,9 @@ namespace VegaISA GPUStaticInst* decode_OPU_VOP3__V_MIN_U16(MachInst); GPUStaticInst* decode_OPU_VOP3__V_MIN_I16(MachInst); GPUStaticInst* decode_OPU_VOP3__V_LDEXP_F16(MachInst); + GPUStaticInst* decode_OPU_VOP3__V_ADD_U32(MachInst); + GPUStaticInst* decode_OPU_VOP3__V_SUB_U32(MachInst); + GPUStaticInst* decode_OPU_VOP3__V_SUBREV_U32(MachInst); GPUStaticInst* decode_OPU_VOP3__V_NOP(MachInst); GPUStaticInst* decode_OPU_VOP3__V_MOV_B32(MachInst); GPUStaticInst* decode_OPU_VOP3__V_CVT_I32_F64(MachInst); diff --git a/src/arch/amdgpu/vega/insts/instructions.cc b/src/arch/amdgpu/vega/insts/instructions.cc index 877bb0960a..9de8a4113e 100644 --- a/src/arch/amdgpu/vega/insts/instructions.cc +++ b/src/arch/amdgpu/vega/insts/instructions.cc @@ -27400,6 +27400,135 @@ namespace VegaISA { panicUnimplemented(); } // execute + // --- Inst_VOP3__V_ADD_U32 class methods --- + + Inst_VOP3__V_ADD_U32::Inst_VOP3__V_ADD_U32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_add_u32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_ADD_U32 + + Inst_VOP3__V_ADD_U32::~Inst_VOP3__V_ADD_U32() + { + } // ~Inst_VOP3__V_ADD_U32 + + // --- description from .arch file --- + // D.u32 = S0.u32 + S1.u32. 
+ void + Inst_VOP3__V_ADD_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] + src1[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_SUB_U32 class methods --- + + Inst_VOP3__V_SUB_U32::Inst_VOP3__V_SUB_U32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_sub_u32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_SUB_U32 + + Inst_VOP3__V_SUB_U32::~Inst_VOP3__V_SUB_U32() + { + } // ~Inst_VOP3__V_SUB_U32 + + // --- description from .arch file --- + // D.u32 = S0.u32 - S1.u32. 
+ void + Inst_VOP3__V_SUB_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] - src1[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_SUBREV_U32 class methods --- + + Inst_VOP3__V_SUBREV_U32::Inst_VOP3__V_SUBREV_U32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_subrev_u32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_SUBREV_U32 + + Inst_VOP3__V_SUBREV_U32::~Inst_VOP3__V_SUBREV_U32() + { + } // ~Inst_VOP3__V_SUBREV_U32 + + // --- description from .arch file --- + // D.u32 = S1.u32 - S0.u32. 
+ void + Inst_VOP3__V_SUBREV_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src1[lane] - src0[lane]; + } + } + + vdst.write(); + } // execute // --- Inst_VOP3__V_NOP class methods --- Inst_VOP3__V_NOP::Inst_VOP3__V_NOP(InFmt_VOP3A *iFmt) diff --git a/src/arch/amdgpu/vega/insts/instructions.hh b/src/arch/amdgpu/vega/insts/instructions.hh index e9361c3cc4..74180900de 100644 --- a/src/arch/amdgpu/vega/insts/instructions.hh +++ b/src/arch/amdgpu/vega/insts/instructions.hh @@ -25712,6 +25712,108 @@ namespace VegaISA void execute(GPUDynInstPtr) override; }; // Inst_VOP3__V_LDEXP_F16 + class Inst_VOP3__V_ADD_U32 : public Inst_VOP3A + { + public: + Inst_VOP3__V_ADD_U32(InFmt_VOP3A*); + ~Inst_VOP3__V_ADD_U32(); + + int + getNumOperands() override + { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return 2; } + + int + getOperandSize(int opIdx) override + { + switch (opIdx) { + case 0: //src_0 + return 4; + case 1: //src_1 + return 4; + case 2: //vdst + return 4; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } // getOperandSize + + void execute(GPUDynInstPtr) override; + }; // Inst_VOP3__V_ADD_U32 + + class Inst_VOP3__V_SUB_U32 : public Inst_VOP3A + { + public: + Inst_VOP3__V_SUB_U32(InFmt_VOP3A*); + ~Inst_VOP3__V_SUB_U32(); + + int + 
getNumOperands() override + { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return 2; } + + int + getOperandSize(int opIdx) override + { + switch (opIdx) { + case 0: //src_0 + return 4; + case 1: //src_1 + return 4; + case 2: //vdst + return 4; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } // getOperandSize + + void execute(GPUDynInstPtr) override; + }; // Inst_VOP3__V_SUB_U32 + + class Inst_VOP3__V_SUBREV_U32 : public Inst_VOP3A + { + public: + Inst_VOP3__V_SUBREV_U32(InFmt_VOP3A*); + ~Inst_VOP3__V_SUBREV_U32(); + + int + getNumOperands() override + { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return 2; } + + int + getOperandSize(int opIdx) override + { + switch (opIdx) { + case 0: //src_0 + return 4; + case 1: //src_1 + return 4; + case 2: //vdst + return 4; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } // getOperandSize + + void execute(GPUDynInstPtr) override; + }; // Inst_VOP3__V_SUBREV_U32 + class Inst_VOP3__V_NOP : public Inst_VOP3A { public: