arch-vega: Implement F32 <-> F16 conversions

These instructions are used in some of the F16 MFMA example applications to convert between F32 and F16 floating-point values.

Change-Id: I7426ea663ce11a39fe8c60c8006d8cca11cfaf07
2024-05-14 08:14:41 -07:00
parent a062229ac3
commit de11daec5f
3 changed files with 117 additions and 4 deletions
--- a/src/arch/amdgpu/vega/insts/op_encodings.hh
+++ b/src/arch/amdgpu/vega/insts/op_encodings.hh
@@ -455,6 +455,29 @@ namespace VegaISA
        // second instruction DWORD
        InFmt_VOP3_1 extData;

+        // Apply the VOP3 output modifier (OMOD) to a result value.
+        //   val:  value after the abs and neg modifiers have been applied.
+        //   omod: 2-bit field; 0 = unchanged, 1 = val * 2, 2 = val * 4,
+        //         3 = val / 2 (integer division when T is integral).
+        // If the instruction supports clamping, call this *before* clamp.
+        template<typename T>
+        T omodModifier(T val, unsigned omod)
+        {
+            assert(omod < 4);
+
+            if constexpr (std::is_floating_point_v<T>) {
+                if (omod == 1) return val * T(2.0f);
+                if (omod == 2) return val * T(4.0f);
+                if (omod == 3) return val / T(2.0f);
+            } else {
+                static_assert(std::is_integral_v<T>);  // compile-time check
+                if (omod == 1) return val * T(2);
+                if (omod == 2) return val * T(4);
+                if (omod == 3) return val / T(2);
+            }
+
+            return val;
+        }
      private:
        bool hasSecondDword(InFmt_VOP3A *);
        /**
--- a/src/arch/amdgpu/vega/insts/vop1.cc
+++ b/src/arch/amdgpu/vega/insts/vop1.cc
@@ -433,7 +433,22 @@ namespace VegaISA
    void
    Inst_VOP1__V_CVT_F16_F32::execute(GPUDynInstPtr gpuDynInst)
    {
-        panicUnimplemented();
+        // Convert the F32 source of every active lane to F16 and store it.
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
+        VecOperandU32 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (!wf->execMask(lane)) continue;  // skip inactive lanes
+
+            const float f32 = src[lane];
+            AMDGPU::mxfloat16 half(f32);
+            // The value written is the data field shifted right by 16.
+            vdst[lane] = (half.data >> 16);
+        }
+        vdst.write();
    } // execute
    // --- Inst_VOP1__V_CVT_F32_F16 class methods ---

@@ -454,7 +469,20 @@ namespace VegaISA
    void
    Inst_VOP1__V_CVT_F32_F16::execute(GPUDynInstPtr gpuDynInst)
    {
-        panicUnimplemented();
+        // Convert the F16 source of every active lane to F32 and store it.
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU32 src(gpuDynInst, instData.SRC0);
+        VecOperandF32 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (!wf->execMask(lane)) continue;  // skip inactive lanes
+
+            AMDGPU::mxfloat16 half(src[lane]);
+            vdst[lane] = float(half);
+        }
+        vdst.write();
    } // execute
    // --- Inst_VOP1__V_CVT_RPI_I32_F32 class methods ---

--- a/src/arch/amdgpu/vega/insts/vop3.cc
+++ b/src/arch/amdgpu/vega/insts/vop3.cc
@@ -2790,7 +2790,42 @@ namespace VegaISA
    void
    Inst_VOP3__V_CVT_F16_F32::execute(GPUDynInstPtr gpuDynInst)
    {
-        panicUnimplemented();
+        // VOP3 F32 -> F16 convert with abs/neg, OMOD, CLAMP and OPSEL[3].
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
+        VecOperandU32 vdst(gpuDynInst, instData.VDST);
+        src0.readSrc();
+        vdst.read();  // read first: only one 16-bit half is replaced below
+
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
+        unsigned abs = instData.ABS;
+        unsigned neg = extData.NEG;
+        int opsel = instData.OPSEL;
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                float tmp = src0[lane];
+
+                if ((abs & 1) && (tmp < 0)) tmp = -tmp;
+                if (neg & 1) tmp = -tmp;
+                // Order: abs, neg, omod, then clamp only if CLAMP is set.
+                tmp = omodModifier(tmp, extData.OMOD);
+                if (instData.CLAMP) tmp = std::clamp(tmp, 0.0f, 1.0f);
+
+                AMDGPU::mxfloat16 out(tmp);
+
+                // If opsel[3] use upper 16-bits of dest, otherwise lower.
+                if (opsel & 8) {
+                    replaceBits(vdst[lane], 31, 16, (out.data >> 16));
+                } else {
+                    replaceBits(vdst[lane], 15, 0, (out.data >> 16));
+                }
+            }
+        }
+
+        vdst.write();
    } // execute
    // --- Inst_VOP3__V_CVT_F32_F16 class methods ---

@@ -2811,7 +2846,34 @@ namespace VegaISA
    void
    Inst_VOP3__V_CVT_F32_F16::execute(GPUDynInstPtr gpuDynInst)
    {
-        panicUnimplemented();
+        // VOP3 F16 -> F32 convert with abs/neg, OMOD and CLAMP modifiers.
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
+        VecOperandF32 vdst(gpuDynInst, instData.VDST);
+        src0.readSrc();
+
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+        panic_if(instData.OPSEL, "OPSEL not implemented for %s", _opcode);
+
+        unsigned abs = instData.ABS;
+        unsigned neg = extData.NEG;
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                AMDGPU::mxfloat16 tmp(src0[lane]);
+
+                if ((abs & 1) && (tmp < 0)) tmp = -tmp;
+                if (neg & 1) tmp = -tmp;
+                // Order: abs, neg, omod, then clamp only if CLAMP is set.
+                float out = omodModifier(float(tmp), extData.OMOD);
+                if (instData.CLAMP) out = std::clamp(out, 0.0f, 1.0f);
+
+                vdst[lane] = out;
+            }
+        }
+
+        vdst.write();
    } // execute
    // --- Inst_VOP3__V_CVT_RPI_I32_F32 class methods ---