Review notes (free text ahead of the first "diff --git" header; git apply
and patch(1) skip it).

What this patch does, as visible in the hunks below:
  * gpu_decoder.cc: inserts a decode_invalid entry after
    V_PKNORM_U16_F16 in the VOP3 decode table and drops one trailing
    decode_invalid, so the table length is unchanged; makes
    decode_OP_VOP2__V_XNOR_B32 construct Inst_VOP2__V_XNOR_B32 instead of
    calling fatal().
  * instructions.hh / vop2.cc: declares and implements
    Inst_VOP2__V_XNOR_B32 (per active lane: vdst = ~(src0 ^ src1)), and
    switches the Inst_VOP2__V_FMAC_F32 operands from U32 to F32 types.
  * vop3p.cc: applies the NEG / NEG_HI source-negation bits in
    V_PK_FMA_F32, V_PK_MUL_F32 and V_PK_ADD_F32; adds SDWA/DPP panics to
    V_PK_ADD_F32; warns when NEG / NEG_HI are set on V_PK_MOV_B32.

Changes made during review (all in place, hunk line counts untouched):
  1. V_PK_FMA_F32 tested bit 0 of NEG / NEG_HI for all three sources.
     It now tests bit 0 for src0, bit 1 for src1 and bit 2 for src2,
     matching the OPSEL masks (1, 2, 4) used in the same function and
     the NEG masks used by V_PK_MUL_F32 / V_PK_ADD_F32 below.
  2. The ".arch" description above Inst_VOP2__V_XNOR_B32::execute said
     "D.u = S1.u - S0.u;"; it now states the XNOR the code computes.
  3. The reinterpret_cast template arguments were missing from the
     submitted text and are restored as <float*> / <uint32_t*>.

NOTE(review): this patch was recovered from a whitespace-collapsed copy.
Indentation and blank context lines are reconstructed so that every hunk
header count adds up; confirm against the tree, or apply with
whitespace-insensitive matching (e.g. git apply --ignore-whitespace).
NOTE(review): the vdst.read() in Inst_VOP2__V_XNOR_B32::execute looks
carried over from V_FMAC_F32 (which accumulates into vdst); it is left
as submitted - confirm whether it is needed.

diff --git a/src/arch/amdgpu/vega/gpu_decoder.cc b/src/arch/amdgpu/vega/gpu_decoder.cc
index 940840719b..969d318c06 100644
--- a/src/arch/amdgpu/vega/gpu_decoder.cc
+++ b/src/arch/amdgpu/vega/gpu_decoder.cc
@@ -1238,6 +1238,7 @@ namespace VegaISA
         &Decoder::decode_OPU_VOP3__V_CVT_PK_I16_I32,
         &Decoder::decode_OPU_VOP3__V_PKNORM_I16_F16,
         &Decoder::decode_OPU_VOP3__V_PKNORM_U16_F16,
+        &Decoder::decode_invalid,
         &Decoder::decode_OPU_VOP3__V_ADD_I32,
         &Decoder::decode_OPU_VOP3__V_SUB_I32,
         &Decoder::decode_OPU_VOP3__V_ADD_I16,
@@ -1337,7 +1338,6 @@ namespace VegaISA
         &Decoder::decode_invalid,
         &Decoder::decode_invalid,
         &Decoder::decode_invalid,
-        &Decoder::decode_invalid,
         &Decoder::decode_invalid
     };
 
@@ -4217,8 +4217,7 @@ namespace VegaISA
     GPUStaticInst*
     Decoder::decode_OP_VOP2__V_XNOR_B32(MachInst iFmt)
     {
-        fatal("Trying to decode instruction without a class\n");
-        return nullptr;
+        return new Inst_VOP2__V_XNOR_B32(&iFmt->iFmt_VOP2);
     }
 
     GPUStaticInst*
diff --git a/src/arch/amdgpu/vega/insts/instructions.hh b/src/arch/amdgpu/vega/insts/instructions.hh
index db03548a3d..4151c2cb8b 100644
--- a/src/arch/amdgpu/vega/insts/instructions.hh
+++ b/src/arch/amdgpu/vega/insts/instructions.hh
@@ -8132,6 +8132,40 @@ namespace VegaISA
         void execute(GPUDynInstPtr) override;
     }; // Inst_VOP2__V_FMAC_F32
 
+    class Inst_VOP2__V_XNOR_B32 : public Inst_VOP2
+    {
+      public:
+        Inst_VOP2__V_XNOR_B32(InFmt_VOP2*);
+        ~Inst_VOP2__V_XNOR_B32();
+
+        int
+        getNumOperands() override
+        {
+            return numDstRegOperands() + numSrcRegOperands();
+        } // getNumOperands
+
+        int numDstRegOperands() override { return 1; }
+        int numSrcRegOperands() override { return 2; }
+
+        int
+        getOperandSize(int opIdx) override
+        {
+            switch (opIdx) {
+              case 0: //src_0
+                return 4;
+              case 1: //src_1
+                return 4;
+              case 2: //vdst
+                return 4;
+              default:
+                fatal("op idx %i out of bounds\n", opIdx);
+                return -1;
+            }
+        } // getOperandSize
+
+        void execute(GPUDynInstPtr) override;
+    }; // Inst_VOP2__V_XNOR_B32
+
     class Inst_VOP1__V_NOP : public Inst_VOP1
     {
       public:
diff --git a/src/arch/amdgpu/vega/insts/vop2.cc b/src/arch/amdgpu/vega/insts/vop2.cc
index ddd77e27da..55146711b6 100644
--- a/src/arch/amdgpu/vega/insts/vop2.cc
+++ b/src/arch/amdgpu/vega/insts/vop2.cc
@@ -2167,9 +2167,9 @@ namespace VegaISA
     Inst_VOP2__V_FMAC_F32::execute(GPUDynInstPtr gpuDynInst)
     {
         Wavefront *wf = gpuDynInst->wavefront();
-        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
-        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
-        VecOperandU32 vdst(gpuDynInst, instData.VDST);
+        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
+        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
+        VecOperandF32 vdst(gpuDynInst, instData.VDST);
 
         src0.readSrc();
         src1.read();
@@ -2181,6 +2181,40 @@ namespace VegaISA
             }
         }
 
+        vdst.write();
+    } // execute
+    // --- Inst_VOP2__V_XNOR_B32 class methods ---
+
+    Inst_VOP2__V_XNOR_B32::Inst_VOP2__V_XNOR_B32(InFmt_VOP2 *iFmt)
+        : Inst_VOP2(iFmt, "v_xnor_b32")
+    {
+        setFlag(ALU);
+    } // Inst_VOP2__V_XNOR_B32
+
+    Inst_VOP2__V_XNOR_B32::~Inst_VOP2__V_XNOR_B32()
+    {
+    } // ~Inst_VOP2__V_XNOR_B32
+
+    // --- description from .arch file ---
+    // D.u = ~(S0.u ^ S1.u);
+    void
+    Inst_VOP2__V_XNOR_B32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
+        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
+        VecOperandU32 vdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.read();
+        vdst.read();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = ~(src0[lane] ^ src1[lane]);
+            }
+        }
+
         vdst.write();
     } // execute
 } // namespace VegaISA
diff --git a/src/arch/amdgpu/vega/insts/vop3p.cc b/src/arch/amdgpu/vega/insts/vop3p.cc
index 224c525e0f..96c296df67 100644
--- a/src/arch/amdgpu/vega/insts/vop3p.cc
+++ b/src/arch/amdgpu/vega/insts/vop3p.cc
@@ -666,6 +666,9 @@ Inst_VOP3P__V_PK_FMA_F32::execute(GPUDynInstPtr gpuDynInst)
     int opsel = instData.OPSEL;
     int opsel_hi = extData.OPSEL_HI | (instData.OPSEL_HI2 << 2);
 
+    int neg = extData.NEG;
+    int neg_hi = instData.NEG_HI;
+
     for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
         if (wf->execMask(lane)) {
             uint32_t s0l = (opsel & 1) ? bits(src0[lane], 63, 32)
@@ -675,9 +678,15 @@ Inst_VOP3P__V_PK_FMA_F32::execute(GPUDynInstPtr gpuDynInst)
             uint32_t s2l = (opsel & 4) ? bits(src2[lane], 63, 32)
                                        : bits(src2[lane], 31, 0);
 
-            float dword1 = std::fma(*reinterpret_cast<float*>(&s0l),
-                                    *reinterpret_cast<float*>(&s1l),
-                                    *reinterpret_cast<float*>(&s2l));
+            float s0lf = *reinterpret_cast<float*>(&s0l);
+            float s1lf = *reinterpret_cast<float*>(&s1l);
+            float s2lf = *reinterpret_cast<float*>(&s2l);
+
+            if (neg & 1) s0lf = -s0lf;
+            if (neg & 2) s1lf = -s1lf;
+            if (neg & 4) s2lf = -s2lf;
+
+            float dword1 = std::fma(s0lf, s1lf, s2lf);
 
             uint32_t s0h = (opsel_hi & 1) ? bits(src0[lane], 63, 32)
                                           : bits(src0[lane], 31, 0);
@@ -686,9 +695,15 @@ Inst_VOP3P__V_PK_FMA_F32::execute(GPUDynInstPtr gpuDynInst)
             uint32_t s2h = (opsel_hi & 4) ? bits(src2[lane], 63, 32)
                                           : bits(src2[lane], 31, 0);
 
-            float dword2 = std::fma(*reinterpret_cast<float*>(&s0h),
-                                    *reinterpret_cast<float*>(&s1h),
-                                    *reinterpret_cast<float*>(&s2h));
+            float s0hf = *reinterpret_cast<float*>(&s0h);
+            float s1hf = *reinterpret_cast<float*>(&s1h);
+            float s2hf = *reinterpret_cast<float*>(&s2h);
+
+            if (neg_hi & 1) s0hf = -s0hf;
+            if (neg_hi & 2) s1hf = -s1hf;
+            if (neg_hi & 4) s2hf = -s2hf;
+
+            float dword2 = std::fma(s0hf, s1hf, s2hf);
 
             uint32_t result1 = *reinterpret_cast<uint32_t*>(&dword1);
             uint32_t result2 = *reinterpret_cast<uint32_t*>(&dword2);
@@ -731,6 +746,9 @@ Inst_VOP3P__V_PK_MUL_F32::execute(GPUDynInstPtr gpuDynInst)
     int opsel = instData.OPSEL;
     int opsel_hi = extData.OPSEL_HI;
 
+    int neg = extData.NEG;
+    int neg_hi = instData.NEG_HI;
+
     for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
         if (wf->execMask(lane)) {
             uint32_t lower_dword = (opsel & 1) ? bits(src0[lane], 63, 32)
@@ -738,16 +756,26 @@ Inst_VOP3P__V_PK_MUL_F32::execute(GPUDynInstPtr gpuDynInst)
             uint32_t upper_dword = (opsel & 2) ? bits(src1[lane], 63, 32)
                                                : bits(src1[lane], 31, 0);
 
-            float dword1 = *reinterpret_cast<float*>(&lower_dword)
-                         * *reinterpret_cast<float*>(&upper_dword);
+            float ldwordf = *reinterpret_cast<float*>(&lower_dword);
+            float udwordf = *reinterpret_cast<float*>(&upper_dword);
+
+            if (neg & 1) ldwordf = -ldwordf;
+            if (neg & 2) udwordf = -udwordf;
+
+            float dword1 = ldwordf * udwordf;
 
             lower_dword = (opsel_hi & 1) ? bits(src0[lane], 63, 32)
                                          : bits(src0[lane], 31, 0);
             upper_dword = (opsel_hi & 2) ? bits(src1[lane], 63, 32)
                                          : bits(src1[lane], 31, 0);
 
-            float dword2 = *reinterpret_cast<float*>(&lower_dword)
-                         * *reinterpret_cast<float*>(&upper_dword);
+            ldwordf = *reinterpret_cast<float*>(&lower_dword);
+            udwordf = *reinterpret_cast<float*>(&upper_dword);
+
+            if (neg_hi & 1) ldwordf = -ldwordf;
+            if (neg_hi & 2) udwordf = -udwordf;
+
+            float dword2 = ldwordf * udwordf;
 
             uint32_t result1 = *reinterpret_cast<uint32_t*>(&dword1);
             uint32_t result2 = *reinterpret_cast<uint32_t*>(&dword2);
@@ -787,9 +815,15 @@ Inst_VOP3P__V_PK_ADD_F32::execute(GPUDynInstPtr gpuDynInst)
     src0.readSrc();
     src1.readSrc();
 
+    panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+    panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
     int opsel = instData.OPSEL;
     int opsel_hi = extData.OPSEL_HI;
 
+    int neg = extData.NEG;
+    int neg_hi = instData.NEG_HI;
+
     for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
         if (wf->execMask(lane)) {
             uint32_t lower_dword = (opsel & 1) ? bits(src0[lane], 63, 32)
@@ -797,16 +831,26 @@ Inst_VOP3P__V_PK_ADD_F32::execute(GPUDynInstPtr gpuDynInst)
             uint32_t upper_dword = (opsel & 2) ? bits(src1[lane], 63, 32)
                                                : bits(src1[lane], 31, 0);
 
-            float dword1 = *reinterpret_cast<float*>(&lower_dword)
-                         + *reinterpret_cast<float*>(&upper_dword);
+            float ldwordf = *reinterpret_cast<float*>(&lower_dword);
+            float udwordf = *reinterpret_cast<float*>(&upper_dword);
+
+            if (neg & 1) ldwordf = -ldwordf;
+            if (neg & 2) udwordf = -udwordf;
+
+            float dword1 = ldwordf + udwordf;
 
             lower_dword = (opsel_hi & 1) ? bits(src0[lane], 63, 32)
                                          : bits(src0[lane], 31, 0);
             upper_dword = (opsel_hi & 2) ? bits(src1[lane], 63, 32)
                                          : bits(src1[lane], 31, 0);
 
-            float dword2 = *reinterpret_cast<float*>(&lower_dword)
-                         + *reinterpret_cast<float*>(&upper_dword);
+            ldwordf = *reinterpret_cast<float*>(&lower_dword);
+            udwordf = *reinterpret_cast<float*>(&upper_dword);
+
+            if (neg_hi & 1) ldwordf = -ldwordf;
+            if (neg_hi & 2) udwordf = -udwordf;
+
+            float dword2 = ldwordf + udwordf;
 
             uint32_t result1 = *reinterpret_cast<uint32_t*>(&dword1);
             uint32_t result2 = *reinterpret_cast<uint32_t*>(&dword2);
@@ -845,9 +889,11 @@ Inst_VOP3P__V_PK_MOV_B32::execute(GPUDynInstPtr gpuDynInst)
 
     // Only OPSEL[1:0] are used
     // OPSEL[0] 0/1: Lower dest dword = lower/upper dword of src0
-
     int opsel = instData.OPSEL;
 
+    warn_if(instData.NEG_HI || extData.NEG,
+            "Negative modifier undefined for %s", _opcode);
+
     for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
         if (wf->execMask(lane)) {
             // OPSEL[1] 0/1: Lower dest dword = lower/upper dword of src1