diff --git a/src/arch/amdgpu/vega/decoder.cc b/src/arch/amdgpu/vega/decoder.cc index 27474d5109..5e2402a4af 100644 --- a/src/arch/amdgpu/vega/decoder.cc +++ b/src/arch/amdgpu/vega/decoder.cc @@ -3614,15 +3614,15 @@ namespace VegaISA &Decoder::decode_OP_VOP3P__V_MAD_MIX_F32, &Decoder::decode_OP_VOP3P__V_MAD_MIXLO_F16, &Decoder::decode_OP_VOP3P__V_MAD_MIXHI_F16, + &Decoder::decode_OP_VOP3P__V_DOT2_F32_F16, &Decoder::decode_invalid, &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, + &Decoder::decode_OP_VOP3P__V_DOT2_I32_I16, + &Decoder::decode_OP_VOP3P__V_DOT2_U32_U16, + &Decoder::decode_OP_VOP3P__V_DOT4_I32_I8, + &Decoder::decode_OP_VOP3P__V_DOT4_U32_U8, + &Decoder::decode_OP_VOP3P__V_DOT8_I32_I4, + &Decoder::decode_OP_VOP3P__V_DOT8_U32_U4, &Decoder::decode_invalid, &Decoder::decode_invalid, &Decoder::decode_invalid, @@ -3667,8 +3667,8 @@ namespace VegaISA &Decoder::decode_OP_VOP3P__V_MFMA_I32_16X16X16I8, &Decoder::decode_invalid, &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, + &Decoder::decode_OP_VOP3P__V_ACCVGPR_READ, + &Decoder::decode_OP_VOP3P__V_ACCVGPR_WRITE, &Decoder::decode_invalid, &Decoder::decode_invalid, &Decoder::decode_invalid, @@ -13125,6 +13125,48 @@ namespace VegaISA return new Inst_VOP3P__V_PK_MOV_B32(&iFmt->iFmt_VOP3P); } + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_DOT2_F32_F16(MachInst iFmt) + { + return new Inst_VOP3P__V_DOT2_F32_F16(&iFmt->iFmt_VOP3P); + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_DOT2_I32_I16(MachInst iFmt) + { + return new Inst_VOP3P__V_DOT2_I32_I16(&iFmt->iFmt_VOP3P); + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_DOT2_U32_U16(MachInst iFmt) + { + return new Inst_VOP3P__V_DOT2_U32_U16(&iFmt->iFmt_VOP3P); + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_DOT4_I32_I8(MachInst iFmt) + { + return new 
Inst_VOP3P__V_DOT4_I32_I8(&iFmt->iFmt_VOP3P); + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_DOT4_U32_U8(MachInst iFmt) + { + return new Inst_VOP3P__V_DOT4_U32_U8(&iFmt->iFmt_VOP3P); + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_DOT8_I32_I4(MachInst iFmt) + { + return new Inst_VOP3P__V_DOT8_I32_I4(&iFmt->iFmt_VOP3P); + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_DOT8_U32_U4(MachInst iFmt) + { + return new Inst_VOP3P__V_DOT8_U32_U4(&iFmt->iFmt_VOP3P); + } + GPUStaticInst* Decoder::decode_OP_VOP3P__V_MFMA_I32_16X16X16I8(MachInst iFmt) { @@ -13132,6 +13174,18 @@ namespace VegaISA &iFmt->iFmt_VOP3P_MAI); } + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_ACCVGPR_READ(MachInst iFmt) + { + return new Inst_VOP3P__V_ACCVGPR_READ(&iFmt->iFmt_VOP3P); + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_ACCVGPR_WRITE(MachInst iFmt) + { + return new Inst_VOP3P__V_ACCVGPR_WRITE(&iFmt->iFmt_VOP3P); + } + GPUStaticInst* Decoder::decode_OP_VOP3P__V_MFMA_F64_16X16X4F64(MachInst iFmt) { diff --git a/src/arch/amdgpu/vega/gpu_decoder.hh b/src/arch/amdgpu/vega/gpu_decoder.hh index 337011cdb8..48084a6913 100644 --- a/src/arch/amdgpu/vega/gpu_decoder.hh +++ b/src/arch/amdgpu/vega/gpu_decoder.hh @@ -1597,7 +1597,16 @@ namespace VegaISA GPUStaticInst* decode_OP_VOP3P__V_PK_MUL_F32(MachInst); GPUStaticInst* decode_OP_VOP3P__V_PK_ADD_F32(MachInst); GPUStaticInst* decode_OP_VOP3P__V_PK_MOV_B32(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_DOT2_F32_F16(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_DOT2_I32_I16(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_DOT2_U32_U16(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_DOT4_I32_I8(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_DOT4_U32_U8(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_DOT8_I32_I4(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_DOT8_U32_U4(MachInst); GPUStaticInst* decode_OP_VOP3P__V_MFMA_I32_16X16X16I8(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_ACCVGPR_READ(MachInst); + GPUStaticInst* 
decode_OP_VOP3P__V_ACCVGPR_WRITE(MachInst); GPUStaticInst* decode_OP_VOP3P__V_MFMA_F64_16X16X4F64(MachInst); GPUStaticInst* subDecode_OPU_VOP3(MachInst); GPUStaticInst* subDecode_OP_DS(MachInst); diff --git a/src/arch/amdgpu/vega/insts/op_encodings.hh b/src/arch/amdgpu/vega/insts/op_encodings.hh index d980eb90bc..9ab7b84974 100644 --- a/src/arch/amdgpu/vega/insts/op_encodings.hh +++ b/src/arch/amdgpu/vega/insts/op_encodings.hh @@ -588,6 +588,56 @@ namespace VegaISA D.write(); } + void + dotHelper(GPUDynInstPtr gpuDynInst, + uint32_t (*fOpImpl)(uint32_t, uint32_t, uint32_t, bool)) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 S0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 S1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 S2(gpuDynInst, extData.SRC2); + VecOperandU32 D(gpuDynInst, instData.VDST); + + S0.readSrc(); + S1.readSrc(); + S2.readSrc(); + + // OPSEL[2] and OPSEL_HI2 are unused. Craft two dwords where: + // dword1[15:0] is upper/lower 16b of src0 based on opsel[0] + // dword1[31:16] is upper/lower 16b of src0 based on opsel_hi[0] + // dword2[15:0] is upper/lower 16b of src1 based on opsel[1] + // dword2[31:16] is upper/lower 16b of src1 based on opsel_hi[1] + int opLo = instData.OPSEL; + int opHi = extData.OPSEL_HI; + int negLo = extData.NEG; + int negHi = instData.NEG_HI; + bool clamp = instData.CLMP; + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + uint32_t dword1l = + word(S0[lane], opLo, negLo, 0); + uint32_t dword1h = + word(S0[lane], opHi, negHi, 0); + uint32_t dword2l = + word(S1[lane], opLo, negLo, 1); + uint32_t dword2h = + word(S1[lane], opHi, negHi, 1); + + uint32_t dword1 = (dword1h << 16) | dword1l; + uint32_t dword2 = (dword2h << 16) | dword2l; + + // Take in two uint32_t dwords and one src2 dword. The + // function will need to call bits to break up to the + // correct size and then reinterpret cast to the correct + // value.
+ D[lane] = fOpImpl(dword1, dword2, S2[lane], clamp); + } + } + + D.write(); + } + private: bool hasSecondDword(InFmt_VOP3P *); diff --git a/src/arch/amdgpu/vega/insts/vop3p.cc b/src/arch/amdgpu/vega/insts/vop3p.cc index a356d3bab3..eddb1e7ad5 100644 --- a/src/arch/amdgpu/vega/insts/vop3p.cc +++ b/src/arch/amdgpu/vega/insts/vop3p.cc @@ -42,6 +42,38 @@ namespace VegaISA using half = uint16_t; // Helper functions +template +int32_t +dotClampI(int32_t value, bool clamp) +{ + // Only valid for N < 32 + static_assert(N < 32); + + if (!clamp) { + return static_cast(value); + } + + int32_t min = -(1 << (N - 1)); + int32_t max = (1 << (N - 1)) - 1; + return std::clamp(value, min, max); +} + +template +uint32_t +dotClampU(uint32_t value, bool clamp) +{ + // Only valid for N < 32 + static_assert(N < 32); + + if (!clamp) { + return static_cast(value); + } + + uint32_t min = 0; + uint32_t max = (1 << N) - 1; + return std::clamp(value, min, max); +} + int16_t clampI16(int32_t value, bool clamp) { @@ -83,6 +115,16 @@ clampF16(uint16_t value, bool clamp) return fplibMax(imm, zero, fpscr2); } +float +clampF32(float value, bool clamp) +{ + if (!clamp) { + return value; + } + + return std::clamp(value, 0.0f, 1.0f); +} + @@ -298,5 +340,296 @@ void Inst_VOP3P__V_PK_MAX_F16::execute(GPUDynInstPtr gpuDynInst) vop3pHelper(gpuDynInst, opImpl); } +void Inst_VOP3P__V_DOT2_F32_F16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = + [](uint32_t S0r, uint32_t S1r, uint32_t S2r, bool clamp) -> uint32_t + { + constexpr unsigned INBITS = 16; + + constexpr unsigned elems = 32 / INBITS; + half S0[elems]; + half S1[elems]; + + for (int i = 0; i < elems; ++i) { + S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS); + S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS); + } + + float S2 = *reinterpret_cast(&S2r); + + // Compute components individually to prevent overflow across packing + half C[elems]; + float Csum = 0.0f; + + for (int i = 0; i < elems; ++i) { + ArmISA::FPSCR fpscr; + C[i] = fplibMul(S0[i], 
S1[i], fpscr); + uint32_t conv = + ArmISA::fplibConvert( + C[i], ArmISA::FPRounding_TIEEVEN, fpscr); + Csum += clampF32(*reinterpret_cast(&conv), clamp); + } + + Csum += S2; + uint32_t rv = *reinterpret_cast(&Csum); + + return rv; + }; + + dotHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_DOT2_I32_I16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = + [](uint32_t S0r, uint32_t S1r, uint32_t S2r, bool clamp) -> uint32_t + { + constexpr unsigned INBITS = 16; + + constexpr unsigned elems = 32 / INBITS; + uint32_t S0[elems]; + uint32_t S1[elems]; + + for (int i = 0; i < elems; ++i) { + S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS); + S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS); + } + + int32_t S2 = *reinterpret_cast(&S2r); + + // Compute components individually to prevent overflow across packing + int32_t C[elems]; + int32_t Csum = 0; + + for (int i = 0; i < elems; ++i) { + C[i] = sext(S0[i]) * sext(S1[i]); + C[i] = sext(dotClampI(C[i], clamp) & mask(INBITS)); + Csum += C[i]; + } + + Csum += S2; + uint32_t rv = *reinterpret_cast(&Csum); + + return rv; + }; + + dotHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_DOT2_U32_U16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = + [](uint32_t S0r, uint32_t S1r, uint32_t S2, bool clamp) -> uint32_t + { + constexpr unsigned INBITS = 16; + + constexpr unsigned elems = 32 / INBITS; + uint32_t S0[elems]; + uint32_t S1[elems]; + + for (int i = 0; i < elems; ++i) { + S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS); + S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS); + } + + // Compute components individually to prevent overflow across packing + uint32_t C[elems]; + uint32_t Csum = 0; + + for (int i = 0; i < elems; ++i) { + C[i] = S0[i] * S1[i]; + C[i] = dotClampU(C[i], clamp); + Csum += C[i]; + } + + Csum += S2; + + return Csum; + }; + + dotHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_DOT4_I32_I8::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = + [](uint32_t S0r, uint32_t S1r, uint32_t S2r, bool 
clamp) -> uint32_t + { + constexpr unsigned INBITS = 8; + + constexpr unsigned elems = 32 / INBITS; + uint32_t S0[elems]; + uint32_t S1[elems]; + + for (int i = 0; i < elems; ++i) { + S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS); + S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS); + } + + int32_t S2 = *reinterpret_cast(&S2r); + + // Compute components individually to prevent overflow across packing + int32_t C[elems]; + int32_t Csum = 0; + + for (int i = 0; i < elems; ++i) { + C[i] = sext(S0[i]) * sext(S1[i]); + C[i] = sext(dotClampI(C[i], clamp) & mask(INBITS)); + Csum += C[i]; + } + + Csum += S2; + uint32_t rv = *reinterpret_cast(&Csum); + + return rv; + }; + + dotHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_DOT4_U32_U8::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = + [](uint32_t S0r, uint32_t S1r, uint32_t S2, bool clamp) -> uint32_t + { + constexpr unsigned INBITS = 8; + + constexpr unsigned elems = 32 / INBITS; + uint32_t S0[elems]; + uint32_t S1[elems]; + + for (int i = 0; i < elems; ++i) { + S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS); + S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS); + } + + // Compute components individually to prevent overflow across packing + uint32_t C[elems]; + uint32_t Csum = 0; + + for (int i = 0; i < elems; ++i) { + C[i] = S0[i] * S1[i]; + C[i] = dotClampU(C[i], clamp); + Csum += C[i]; + } + + Csum += S2; + + return Csum; + }; + + dotHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_DOT8_I32_I4::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = + [](uint32_t S0r, uint32_t S1r, uint32_t S2r, bool clamp) -> uint32_t + { + constexpr unsigned INBITS = 4; + + constexpr unsigned elems = 32 / INBITS; + uint32_t S0[elems]; + uint32_t S1[elems]; + + for (int i = 0; i < elems; ++i) { + S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS); + S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS); + } + + int32_t S2 = *reinterpret_cast(&S2r); + + // Compute components individually to prevent overflow across packing + int32_t C[elems]; 
+ int32_t Csum = 0; + + for (int i = 0; i < elems; ++i) { + C[i] = sext(S0[i]) * sext(S1[i]); + C[i] = sext(dotClampI(C[i], clamp) & mask(INBITS)); + Csum += C[i]; + } + + Csum += S2; + uint32_t rv = *reinterpret_cast(&Csum); + + return rv; + }; + + dotHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_DOT8_U32_U4::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = + [](uint32_t S0r, uint32_t S1r, uint32_t S2, bool clamp) -> uint32_t + { + constexpr unsigned INBITS = 4; + + constexpr unsigned elems = 32 / INBITS; + uint32_t S0[elems]; + uint32_t S1[elems]; + + for (int i = 0; i < elems; ++i) { + S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS); + S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS); + } + + // Compute components individually to prevent overflow across packing + uint32_t C[elems]; + uint32_t Csum = 0; + + for (int i = 0; i < elems; ++i) { + C[i] = S0[i] * S1[i]; + C[i] = dotClampU(C[i], clamp); + Csum += C[i]; + } + + Csum += S2; + + return Csum; + }; + + dotHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_ACCVGPR_READ::execute(GPUDynInstPtr gpuDynInst) +{ + // The Acc register file is not supported in gem5 and has been removed + // in MI200. Therefore this instruction becomes a mov. + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, extData.SRC0); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src[lane]; + } + } + + vdst.write(); +} + +void Inst_VOP3P__V_ACCVGPR_WRITE::execute(GPUDynInstPtr gpuDynInst) +{ + // The Acc register file is not supported in gem5 and has been removed + // in MI200. Therefore this instruction becomes a mov. 
+ Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, extData.SRC0); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src[lane]; + } + } + + vdst.write(); +} + } // namespace VegaISA } // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/vop3p.hh b/src/arch/amdgpu/vega/insts/vop3p.hh index 56f0f80435..fbb81f12f7 100644 --- a/src/arch/amdgpu/vega/insts/vop3p.hh +++ b/src/arch/amdgpu/vega/insts/vop3p.hh @@ -42,6 +42,41 @@ namespace gem5 namespace VegaISA { + // One source operand + class Inst_VOP3P__1OP : public Inst_VOP3P + { + public: + Inst_VOP3P__1OP(InFmt_VOP3P *iFmt, const std::string& name) + : Inst_VOP3P(iFmt, name) + { + setFlag(ALU); + } + + int + getNumOperands() override + { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return 1; } + + int + getOperandSize(int opIdx) override + { + switch (opIdx) { + case 0: // src + return 4; + case 1: // dst + return 4; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } + + }; + // Two source operands with two 16-bit values in a dword class Inst_VOP3P__2OP_X16 : public Inst_VOP3P { @@ -310,6 +345,96 @@ namespace VegaISA void execute(GPUDynInstPtr gpuDynInst) override; }; + + class Inst_VOP3P__V_DOT2_F32_F16 : public Inst_VOP3P__3OP_X16 + { + public: + Inst_VOP3P__V_DOT2_F32_F16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__3OP_X16(iFmt, "v_dot2_f32_f16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_DOT2_I32_I16 : public Inst_VOP3P__3OP_X16 + { + public: + Inst_VOP3P__V_DOT2_I32_I16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__3OP_X16(iFmt, "v_dot2_i32_i16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_DOT2_U32_U16 : public Inst_VOP3P__3OP_X16 + { + public: + 
Inst_VOP3P__V_DOT2_U32_U16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__3OP_X16(iFmt, "v_dot2_u32_u16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_DOT4_I32_I8 : public Inst_VOP3P__3OP_X16 + { + public: + Inst_VOP3P__V_DOT4_I32_I8(InFmt_VOP3P *iFmt) + : Inst_VOP3P__3OP_X16(iFmt, "v_dot4_i32_i8") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_DOT4_U32_U8 : public Inst_VOP3P__3OP_X16 + { + public: + Inst_VOP3P__V_DOT4_U32_U8(InFmt_VOP3P *iFmt) + : Inst_VOP3P__3OP_X16(iFmt, "v_dot4_u32_u8") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_DOT8_I32_I4 : public Inst_VOP3P__3OP_X16 + { + public: + Inst_VOP3P__V_DOT8_I32_I4(InFmt_VOP3P *iFmt) + : Inst_VOP3P__3OP_X16(iFmt, "v_dot8_i32_i4") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_DOT8_U32_U4 : public Inst_VOP3P__3OP_X16 + { + public: + Inst_VOP3P__V_DOT8_U32_U4(InFmt_VOP3P *iFmt) + : Inst_VOP3P__3OP_X16(iFmt, "v_dot8_u32_u4") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_ACCVGPR_READ : public Inst_VOP3P__1OP + { + public: + Inst_VOP3P__V_ACCVGPR_READ(InFmt_VOP3P *iFmt) + : Inst_VOP3P__1OP(iFmt, "v_accvgpr_read") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_ACCVGPR_WRITE : public Inst_VOP3P__1OP + { + public: + Inst_VOP3P__V_ACCVGPR_WRITE(InFmt_VOP3P *iFmt) + : Inst_VOP3P__1OP(iFmt, "v_accvgpr_write") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; } // namespace VegaISA } // namespace gem5