From de11daec5f02db103064fce24a8be4d2d03918be Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Tue, 14 May 2024 08:14:41 -0700 Subject: [PATCH] arch-vega: Implement F32 <-> F16 conversions These instructions are used in some of the F16 MFMA example applications to convert to/from floating point types. Change-Id: I7426ea663ce11a39fe8c60c8006d8cca11cfaf07 --- src/arch/amdgpu/vega/insts/op_encodings.hh | 23 ++++++++ src/arch/amdgpu/vega/insts/vop1.cc | 32 ++++++++++- src/arch/amdgpu/vega/insts/vop3.cc | 66 +++++++++++++++++++++- 3 files changed, 117 insertions(+), 4 deletions(-) diff --git a/src/arch/amdgpu/vega/insts/op_encodings.hh b/src/arch/amdgpu/vega/insts/op_encodings.hh index 5861f296ff..3c5804526a 100644 --- a/src/arch/amdgpu/vega/insts/op_encodings.hh +++ b/src/arch/amdgpu/vega/insts/op_encodings.hh @@ -455,6 +455,29 @@ namespace VegaISA // second instruction DWORD InFmt_VOP3_1 extData; + // Output modifier for VOP3 instructions. This 2-bit field can be set + // to "0" to do nothing, "1" to multiply output value by 2, "2" to + // multiply output value by 4, or "3" to divide output value by 2. If + // the instruction supports clamping, this is applied *before* clamp + // but after the abs and neg modifiers. 
+ template<typename T> + T omodModifier(T val, unsigned omod) + { + assert(omod < 4); + + if constexpr (std::is_floating_point_v<T>) { + if (omod == 1) return val * T(2.0f); + if (omod == 2) return val * T(4.0f); + if (omod == 3) return val / T(2.0f); + } else { + assert(std::is_integral_v<T>); + if (omod == 1) return val * T(2); + if (omod == 2) return val * T(4); + if (omod == 3) return val / T(2); + } + + return val; + } private: bool hasSecondDword(InFmt_VOP3A *); /** diff --git a/src/arch/amdgpu/vega/insts/vop1.cc b/src/arch/amdgpu/vega/insts/vop1.cc index 1bd49653ab..3bbf1e0085 100644 --- a/src/arch/amdgpu/vega/insts/vop1.cc +++ b/src/arch/amdgpu/vega/insts/vop1.cc @@ -433,7 +433,22 @@ namespace VegaISA void Inst_VOP1__V_CVT_F16_F32::execute(GPUDynInstPtr gpuDynInst) { - panicUnimplemented(); + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, instData.SRC0); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + float tmp = src[lane]; + AMDGPU::mxfloat16 out(tmp); + + vdst[lane] = (out.data >> 16); + } + } + + vdst.write(); } // execute // --- Inst_VOP1__V_CVT_F32_F16 class methods --- @@ -454,7 +469,20 @@ namespace VegaISA void Inst_VOP1__V_CVT_F32_F16::execute(GPUDynInstPtr gpuDynInst) { - panicUnimplemented(); + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, instData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + AMDGPU::mxfloat16 tmp(src[lane]); + vdst[lane] = float(tmp); + } + } + + vdst.write(); } // execute // --- Inst_VOP1__V_CVT_RPI_I32_F32 class methods --- diff --git a/src/arch/amdgpu/vega/insts/vop3.cc b/src/arch/amdgpu/vega/insts/vop3.cc index 921cd18c26..47665ad353 100644 --- a/src/arch/amdgpu/vega/insts/vop3.cc +++ b/src/arch/amdgpu/vega/insts/vop3.cc @@ -2790,7 +2790,42 @@ namespace
VegaISA void Inst_VOP3__V_CVT_F16_F32::execute(GPUDynInstPtr gpuDynInst) { - panicUnimplemented(); + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + vdst.read(); + + panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode); + panic_if(isDPPInst(), "DPP not implemented for %s", _opcode); + + unsigned abs = instData.ABS; + unsigned neg = extData.NEG; + int opsel = instData.OPSEL; + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + float tmp = src0[lane]; + + if ((abs & 1) && (tmp < 0)) tmp = -tmp; + if (neg & 1) tmp = -tmp; + + tmp = omodModifier(tmp, extData.OMOD); + if (instData.CLAMP) tmp = std::clamp(tmp, 0.0f, 1.0f); + + AMDGPU::mxfloat16 out(tmp); + + // If opsel[3] use upper 16-bits of dest, otherwise lower. + if (opsel & 8) { + replaceBits(vdst[lane], 31, 16, (out.data >> 16)); + } else { + replaceBits(vdst[lane], 15, 0, (out.data >> 16)); + } + } + } + + vdst.write(); } // execute // --- Inst_VOP3__V_CVT_F32_F16 class methods --- @@ -2811,7 +2846,34 @@ namespace VegaISA void Inst_VOP3__V_CVT_F32_F16::execute(GPUDynInstPtr gpuDynInst) { - panicUnimplemented(); + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + + panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode); + panic_if(isDPPInst(), "DPP not implemented for %s", _opcode); + panic_if(instData.OPSEL, "OPSEL not implemented for %s", _opcode); + + unsigned abs = instData.ABS; + unsigned neg = extData.NEG; + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + AMDGPU::mxfloat16 tmp(src0[lane]); + + if ((abs & 1) && (tmp < 0)) tmp = -tmp; + if (neg & 1) tmp = -tmp; + + float out = omodModifier(float(tmp), extData.OMOD); + if (instData.CLAMP) out = std::clamp(out, 0.0f, 1.0f); + + vdst[lane] = out; + } + } + + vdst.write(); } //
execute // --- Inst_VOP3__V_CVT_RPI_I32_F32 class methods ---