From 420cda1befd073b17d4f4ccfc639cd5e4df9bf2b Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Fri, 8 Dec 2023 12:12:16 -0600 Subject: [PATCH] arch-vega: Implement FP32 packed math Starting with MI200, packed math can operate on double dword inputs. In this case, 64-bits of inputs (two VGPRs per lane) contain two FP32 values. Add instructions to perform add, multiply, and FMA on packed FP32 types. Change-Id: Ib838bff91a10e02e013cc7c33ec3d91ff08647b0 --- src/arch/amdgpu/vega/decoder.cc | 36 ++- src/arch/amdgpu/vega/gpu_decoder.hh | 3 + src/arch/amdgpu/vega/insts/instructions.cc | 279 +++++++++++++++++++++ src/arch/amdgpu/vega/insts/instructions.hh | 175 +++++++++++++ 4 files changed, 482 insertions(+), 11 deletions(-) diff --git a/src/arch/amdgpu/vega/decoder.cc b/src/arch/amdgpu/vega/decoder.cc index e7bea7c33b..27474d5109 100644 --- a/src/arch/amdgpu/vega/decoder.cc +++ b/src/arch/amdgpu/vega/decoder.cc @@ -3627,9 +3627,9 @@ namespace VegaISA &Decoder::decode_invalid, &Decoder::decode_invalid, &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, + &Decoder::decode_OP_VOP3P__V_PK_FMA_F32, + &Decoder::decode_OP_VOP3P__V_PK_MUL_F32, + &Decoder::decode_OP_VOP3P__V_PK_ADD_F32, &Decoder::decode_OP_VOP3P__V_PK_MOV_B32, &Decoder::decode_invalid, &Decoder::decode_invalid, @@ -4203,8 +4203,7 @@ namespace VegaISA GPUStaticInst* Decoder::decode_OP_VOP2__V_FMAC_F32(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_VOP2__V_FMAC_F32(&iFmt->iFmt_VOP2); } GPUStaticInst* @@ -8293,8 +8292,7 @@ namespace VegaISA GPUStaticInst* Decoder::decode_OP_FLAT__FLAT_STORE_SHORT_D16_HI(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_FLAT__FLAT_STORE_SHORT_D16_HI(&iFmt->iFmt_FLAT); } GPUStaticInst* @@ -8607,8 +8605,7 @@ namespace VegaISA GPUStaticInst* 
Decoder::decode_OP_GLOBAL__GLOBAL_STORE_SHORT_D16_HI(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_FLAT__FLAT_STORE_SHORT_D16_HI(&iFmt->iFmt_FLAT); } GPUStaticInst* @@ -9968,8 +9965,7 @@ namespace VegaISA GPUStaticInst* Decoder::decode_OP_SCRATCH__SCRATCH_STORE_SHORT_D16_HI(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_FLAT__FLAT_STORE_SHORT_D16_HI(&iFmt->iFmt_FLAT); } GPUStaticInst* @@ -13105,6 +13101,24 @@ namespace VegaISA return nullptr; } + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_PK_FMA_F32(MachInst iFmt) + { + return new Inst_VOP3P__V_PK_FMA_F32(&iFmt->iFmt_VOP3P); + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_PK_MUL_F32(MachInst iFmt) + { + return new Inst_VOP3P__V_PK_MUL_F32(&iFmt->iFmt_VOP3P); + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_PK_ADD_F32(MachInst iFmt) + { + return new Inst_VOP3P__V_PK_ADD_F32(&iFmt->iFmt_VOP3P); + } + GPUStaticInst* Decoder::decode_OP_VOP3P__V_PK_MOV_B32(MachInst iFmt) { diff --git a/src/arch/amdgpu/vega/gpu_decoder.hh b/src/arch/amdgpu/vega/gpu_decoder.hh index 4ebb95e5f4..337011cdb8 100644 --- a/src/arch/amdgpu/vega/gpu_decoder.hh +++ b/src/arch/amdgpu/vega/gpu_decoder.hh @@ -1593,6 +1593,9 @@ namespace VegaISA GPUStaticInst* decode_OP_VOP3P__V_MAD_MIX_F32(MachInst); GPUStaticInst* decode_OP_VOP3P__V_MAD_MIXLO_F16(MachInst); GPUStaticInst* decode_OP_VOP3P__V_MAD_MIXHI_F16(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_PK_FMA_F32(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_PK_MUL_F32(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_PK_ADD_F32(MachInst); GPUStaticInst* decode_OP_VOP3P__V_PK_MOV_B32(MachInst); GPUStaticInst* decode_OP_VOP3P__V_MFMA_I32_16X16X16I8(MachInst); GPUStaticInst* decode_OP_VOP3P__V_MFMA_F64_16X16X4F64(MachInst); diff --git a/src/arch/amdgpu/vega/insts/instructions.cc b/src/arch/amdgpu/vega/insts/instructions.cc index 9104f29228..5f951f860e 100644 --- 
a/src/arch/amdgpu/vega/insts/instructions.cc +++ b/src/arch/amdgpu/vega/insts/instructions.cc @@ -8129,6 +8129,40 @@ namespace VegaISA vdst.write(); } // execute + // --- Inst_VOP2__V_FMAC_F32 class methods --- + + Inst_VOP2__V_FMAC_F32::Inst_VOP2__V_FMAC_F32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_fmac_f32") + { + setFlag(ALU); + } // Inst_VOP2__V_FMAC_F32 + + Inst_VOP2__V_FMAC_F32::~Inst_VOP2__V_FMAC_F32() + { + } // ~Inst_VOP2__V_FMAC_F32 + + // --- description from .arch file --- + // D.f = S0.f * S1.f + D.f; + void + Inst_VOP2__V_FMAC_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + vdst.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::fma(src0[lane], src1[lane], vdst[lane]); + } + } + + vdst.write(); + } // execute // --- Inst_VOP1__V_NOP class methods --- Inst_VOP1__V_NOP::Inst_VOP1__V_NOP(InFmt_VOP1 *iFmt) @@ -44497,6 +44531,66 @@ namespace VegaISA Inst_FLAT__FLAT_STORE_SHORT::completeAcc(GPUDynInstPtr gpuDynInst) { } // completeAcc + // --- Inst_FLAT__FLAT_STORE_SHORT_D16_HI class methods --- + + Inst_FLAT__FLAT_STORE_SHORT_D16_HI:: + Inst_FLAT__FLAT_STORE_SHORT_D16_HI(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_store_short_d16_hi") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_FLAT__FLAT_STORE_SHORT_D16_HI + + Inst_FLAT__FLAT_STORE_SHORT_D16_HI::~Inst_FLAT__FLAT_STORE_SHORT_D16_HI() + { + } // ~Inst_FLAT__FLAT_STORE_SHORT_D16_HI + + // --- description from .arch file --- + // Untyped buffer store short. 
+ void + Inst_FLAT__FLAT_STORE_SHORT_D16_HI::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } + wf->decExpInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 data(gpuDynInst, extData.DATA); + + data.read(); + + calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast<VecElemU16*>(gpuDynInst->d_data))[lane] + = (data[lane] >> 16); + } + } + + issueRequestHelper(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_STORE_SHORT_D16_HI::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemWrite<VecElemU16>(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_STORE_SHORT_D16_HI::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc // --- Inst_FLAT__FLAT_STORE_DWORD class methods --- Inst_FLAT__FLAT_STORE_DWORD::Inst_FLAT__FLAT_STORE_DWORD(InFmt_FLAT *iFmt) @@ -45995,6 +46089,191 @@ namespace VegaISA { atomicComplete(gpuDynInst); } // completeAcc + // --- Inst_VOP3P__V_PK_FMA_F32 class methods --- + + Inst_VOP3P__V_PK_FMA_F32::Inst_VOP3P__V_PK_FMA_F32(InFmt_VOP3P *iFmt) + : Inst_VOP3P(iFmt, "v_pk_fma_f32") + { + setFlag(ALU); + } // Inst_VOP3P__V_PK_FMA_F32 + + Inst_VOP3P__V_PK_FMA_F32::~Inst_VOP3P__V_PK_FMA_F32() + { + } // ~Inst_VOP3P__V_PK_FMA_F32 + + // D.f[63:32] = S0.f[63:32] * S1.f[63:32] + S2.f[63:32] . D.f[31:0] = + // S0.f[31:0] * S1.f[31:0] + S2.f[31:0] . + void + Inst_VOP3P__V_PK_FMA_F32::execute(GPUDynInstPtr gpuDynInst) + { + // This is a special case of packed instructions which operates on + // 64-bit inputs/outputs and not 32-bit. U64 is used here as float + // values cannot use bitwise operations. 
Consider the U64 to imply + // untyped 64-bits of data. + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU64 src2(gpuDynInst, extData.SRC2); + VecOperandU64 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + int opsel = instData.OPSEL; + int opsel_hi = extData.OPSEL_HI | (instData.OPSEL_HI2 << 2); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + uint32_t s0l = (opsel & 1) ? bits(src0[lane], 63, 32) + : bits(src0[lane], 31, 0); + uint32_t s1l = (opsel & 2) ? bits(src1[lane], 63, 32) + : bits(src1[lane], 31, 0); + uint32_t s2l = (opsel & 4) ? bits(src2[lane], 63, 32) + : bits(src2[lane], 31, 0); + + float dword1 = std::fma(*reinterpret_cast<float*>(&s0l), + *reinterpret_cast<float*>(&s1l), + *reinterpret_cast<float*>(&s2l)); + + uint32_t s0h = (opsel_hi & 1) ? bits(src0[lane], 63, 32) + : bits(src0[lane], 31, 0); + uint32_t s1h = (opsel_hi & 2) ? bits(src1[lane], 63, 32) + : bits(src1[lane], 31, 0); + uint32_t s2h = (opsel_hi & 4) ? bits(src2[lane], 63, 32) + : bits(src2[lane], 31, 0); + + float dword2 = std::fma(*reinterpret_cast<float*>(&s0h), + *reinterpret_cast<float*>(&s1h), + *reinterpret_cast<float*>(&s2h)); + + uint64_t result1 = *reinterpret_cast<uint32_t*>(&dword1); + uint64_t result2 = *reinterpret_cast<uint32_t*>(&dword2); + + vdst[lane] = (result2 << 32) | result1; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3P__V_PK_MUL_F32 class methods --- + + Inst_VOP3P__V_PK_MUL_F32::Inst_VOP3P__V_PK_MUL_F32(InFmt_VOP3P *iFmt) + : Inst_VOP3P(iFmt, "v_pk_mul_f32") + { + setFlag(ALU); + } // Inst_VOP3P__V_PK_MUL_F32 + + Inst_VOP3P__V_PK_MUL_F32::~Inst_VOP3P__V_PK_MUL_F32() + { + } // ~Inst_VOP3P__V_PK_MUL_F32 + + // D.f[63:32] = S0.f[63:32] * S1.f[63:32] . 
D.f[31:0] = S0.f[31:0] * + // S1.f[31:0] + void + Inst_VOP3P__V_PK_MUL_F32::execute(GPUDynInstPtr gpuDynInst) + { + // This is a special case of packed instructions which operates on + // 64-bit inputs/outputs and not 32-bit. U64 is used here as float + // values cannot use bitwise operations. Consider the U64 to imply + // untyped 64-bits of data. + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); + VecOperandU64 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + int opsel = instData.OPSEL; + int opsel_hi = extData.OPSEL_HI; + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + uint32_t lower_dword = (opsel & 1) ? bits(src0[lane], 63, 32) + : bits(src0[lane], 31, 0); + uint32_t upper_dword = (opsel & 2) ? bits(src1[lane], 63, 32) + : bits(src1[lane], 31, 0); + + float dword1 = *reinterpret_cast<float*>(&lower_dword) + * *reinterpret_cast<float*>(&upper_dword); + + lower_dword = (opsel_hi & 1) ? bits(src0[lane], 63, 32) + : bits(src0[lane], 31, 0); + upper_dword = (opsel_hi & 2) ? bits(src1[lane], 63, 32) + : bits(src1[lane], 31, 0); + + float dword2 = *reinterpret_cast<float*>(&lower_dword) + * *reinterpret_cast<float*>(&upper_dword); + + uint64_t result1 = *reinterpret_cast<uint32_t*>(&dword1); + uint64_t result2 = *reinterpret_cast<uint32_t*>(&dword2); + + vdst[lane] = (result2 << 32) | result1; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3P__V_PK_ADD_F32 class methods --- + + Inst_VOP3P__V_PK_ADD_F32::Inst_VOP3P__V_PK_ADD_F32(InFmt_VOP3P *iFmt) + : Inst_VOP3P(iFmt, "v_pk_add_f32") + { + setFlag(ALU); + } // Inst_VOP3P__V_PK_ADD_F32 + + Inst_VOP3P__V_PK_ADD_F32::~Inst_VOP3P__V_PK_ADD_F32() + { + } // ~Inst_VOP3P__V_PK_ADD_F32 + + // D.f[63:32] = S0.f[63:32] + S1.f[63:32] . 
D.f[31:0] = S0.f[31:0] + + // S1.f[31:0] + void + Inst_VOP3P__V_PK_ADD_F32::execute(GPUDynInstPtr gpuDynInst) + { + // This is a special case of packed instructions which operates on + // 64-bit inputs/outputs and not 32-bit. U64 is used here as float + // values cannot use bitwise operations. Consider the U64 to imply + // untyped 64-bits of data. + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); + VecOperandU64 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + int opsel = instData.OPSEL; + int opsel_hi = extData.OPSEL_HI; + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + uint32_t lower_dword = (opsel & 1) ? bits(src0[lane], 63, 32) + : bits(src0[lane], 31, 0); + uint32_t upper_dword = (opsel & 2) ? bits(src1[lane], 63, 32) + : bits(src1[lane], 31, 0); + + float dword1 = *reinterpret_cast<float*>(&lower_dword) + + *reinterpret_cast<float*>(&upper_dword); + + lower_dword = (opsel_hi & 1) ? bits(src0[lane], 63, 32) + : bits(src0[lane], 31, 0); + upper_dword = (opsel_hi & 2) ? 
bits(src1[lane], 63, 32) + : bits(src1[lane], 31, 0); + + float dword2 = *reinterpret_cast<float*>(&lower_dword) + + *reinterpret_cast<float*>(&upper_dword); + + uint64_t result1 = *reinterpret_cast<uint32_t*>(&dword1); + uint64_t result2 = *reinterpret_cast<uint32_t*>(&dword2); + + vdst[lane] = (result2 << 32) | result1; + } + } + + vdst.write(); + } // execute // --- Inst_VOP3P__V_PK_MOV_B32 class methods --- Inst_VOP3P__V_PK_MOV_B32::Inst_VOP3P__V_PK_MOV_B32(InFmt_VOP3P *iFmt) diff --git a/src/arch/amdgpu/vega/insts/instructions.hh b/src/arch/amdgpu/vega/insts/instructions.hh index c65f4c1609..db03548a3d 100644 --- a/src/arch/amdgpu/vega/insts/instructions.hh +++ b/src/arch/amdgpu/vega/insts/instructions.hh @@ -8098,6 +8098,40 @@ namespace VegaISA void execute(GPUDynInstPtr) override; }; // Inst_VOP2__V_SUBREV_U32 + class Inst_VOP2__V_FMAC_F32 : public Inst_VOP2 + { + public: + Inst_VOP2__V_FMAC_F32(InFmt_VOP2*); + ~Inst_VOP2__V_FMAC_F32(); + + int + getNumOperands() override + { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return 2; } + + int + getOperandSize(int opIdx) override + { + switch (opIdx) { + case 0: //src_0 + return 4; + case 1: //src_1 + return 4; + case 2: //vdst + return 4; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } // getOperandSize + + void execute(GPUDynInstPtr) override; + }; // Inst_VOP2__V_FMAC_F32 + class Inst_VOP1__V_NOP : public Inst_VOP1 { public: @@ -42280,6 +42314,43 @@ namespace VegaISA void completeAcc(GPUDynInstPtr) override; }; // Inst_FLAT__FLAT_STORE_SHORT + class Inst_FLAT__FLAT_STORE_SHORT_D16_HI : public Inst_FLAT + { + public: + Inst_FLAT__FLAT_STORE_SHORT_D16_HI(InFmt_FLAT*); + ~Inst_FLAT__FLAT_STORE_SHORT_D16_HI(); + + int + getNumOperands() override + { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 0; } + int numSrcRegOperands() override 
{ return isFlat() ? 2 : 3; } + + int + getOperandSize(int opIdx) override + { + switch (opIdx) { + case 0: //vgpr_addr + return vgprIsOffset() ? 4 : 8; + case 1: //vgpr_src + return 2; + case 2: //saddr + assert(!isFlat()); + return 8; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } // getOperandSize + + void execute(GPUDynInstPtr) override; + void initiateAcc(GPUDynInstPtr) override; + void completeAcc(GPUDynInstPtr) override; + }; // Inst_FLAT__FLAT_STORE_SHORT_D16_HI + class Inst_FLAT__FLAT_STORE_DWORD : public Inst_FLAT { public: @@ -43637,6 +43708,110 @@ namespace VegaISA void completeAcc(GPUDynInstPtr) override; }; // Inst_FLAT__FLAT_ATOMIC_MAX_F64 + class Inst_VOP3P__V_PK_FMA_F32 : public Inst_VOP3P + { + public: + Inst_VOP3P__V_PK_FMA_F32(InFmt_VOP3P*); + ~Inst_VOP3P__V_PK_FMA_F32(); + + int + getNumOperands() override + { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return 3; } + + int + getOperandSize(int opIdx) override + { + switch (opIdx) { + case 0: // src0 + return 8; + case 1: // src1 + return 8; + case 2: // src2 + return 8; + case 3: // dst + return 8; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } // getOperandSize + + void execute(GPUDynInstPtr) override; + }; // Inst_VOP3P__V_PK_FMA_F32 + + class Inst_VOP3P__V_PK_MUL_F32 : public Inst_VOP3P + { + public: + Inst_VOP3P__V_PK_MUL_F32(InFmt_VOP3P*); + ~Inst_VOP3P__V_PK_MUL_F32(); + + int + getNumOperands() override + { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return 2; } + + int + getOperandSize(int opIdx) override + { + switch (opIdx) { + case 0: // src0 + return 8; + case 1: // src1 + return 8; + case 2: // dst + return 8; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } // 
getOperandSize + + void execute(GPUDynInstPtr) override; + }; // Inst_VOP3P__V_PK_MUL_F32 + + class Inst_VOP3P__V_PK_ADD_F32 : public Inst_VOP3P + { + public: + Inst_VOP3P__V_PK_ADD_F32(InFmt_VOP3P*); + ~Inst_VOP3P__V_PK_ADD_F32(); + + int + getNumOperands() override + { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return 2; } + + int + getOperandSize(int opIdx) override + { + switch (opIdx) { + case 0: // src0 + return 8; + case 1: // src1 + return 8; + case 2: // dst + return 8; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } // getOperandSize + + void execute(GPUDynInstPtr) override; + }; // Inst_VOP3P__V_PK_ADD_F32 + class Inst_VOP3P__V_PK_MOV_B32 : public Inst_VOP3P { public: