From de11daec5f02db103064fce24a8be4d2d03918be Mon Sep 17 00:00:00 2001
From: Matthew Poremba <matthew.poremba@amd.com>
Date: Tue, 14 May 2024 08:14:41 -0700
Subject: [PATCH] arch-vega: Implement F32 <-> F16 conversions

These instructions are used in some of the F16 MFMA example applications
to convert to/from floating point types.

Change-Id: I7426ea663ce11a39fe8c60c8006d8cca11cfaf07
---
 src/arch/amdgpu/vega/insts/op_encodings.hh | 23 ++++++++
 src/arch/amdgpu/vega/insts/vop1.cc         | 32 ++++++++++-
 src/arch/amdgpu/vega/insts/vop3.cc         | 66 +++++++++++++++++++++-
 3 files changed, 117 insertions(+), 4 deletions(-)
diff --git a/src/arch/amdgpu/vega/insts/op_encodings.hh b/src/arch/amdgpu/vega/insts/op_encodings.hh
index 5861f296ff..3c5804526a 100644
--- a/src/arch/amdgpu/vega/insts/op_encodings.hh
+++ b/src/arch/amdgpu/vega/insts/op_encodings.hh
@@ -455,6 +455,34 @@ namespace VegaISA
         // second instruction DWORD
         InFmt_VOP3_1 extData;
 
+        /**
+         * Apply the VOP3 output modifier (OMOD) to a result value.
+         *
+         * OMOD is a 2-bit field: 0 leaves the value unchanged, 1 multiplies
+         * it by 2, 2 multiplies it by 4 and 3 divides it by 2. If the
+         * instruction supports clamping, OMOD is applied *before* the clamp
+         * but after the abs and neg modifiers.
+         *
+         * @param val Value to scale; T must be an arithmetic type.
+         * @param omod Raw OMOD field value, must be less than 4.
+         * @return val scaled according to omod.
+         */
+        template<typename T>
+        T omodModifier(T val, unsigned omod)
+        {
+            // The type requirement is known at compile time, so reject
+            // unsupported types there rather than with a runtime assert.
+            static_assert(std::is_arithmetic_v<T>,
+                          "omodModifier requires an arithmetic type");
+            assert(omod < 4);
+
+            switch (omod) {
+              case 1: return val * T(2);
+              case 2: return val * T(4);
+              case 3: return val / T(2);
+              default: return val;
+            }
+        }
       private:
         bool hasSecondDword(InFmt_VOP3A *);
         /**
diff --git a/src/arch/amdgpu/vega/insts/vop1.cc b/src/arch/amdgpu/vega/insts/vop1.cc
index 1bd49653ab..3bbf1e0085 100644
--- a/src/arch/amdgpu/vega/insts/vop1.cc
+++ b/src/arch/amdgpu/vega/insts/vop1.cc
@@ -433,7 +433,28 @@ namespace VegaISA
     void
     Inst_VOP1__V_CVT_F16_F32::execute(GPUDynInstPtr gpuDynInst)
     {
-        panicUnimplemented();
+        // Convert the F32 source operand to F16 for every active lane.
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
+        VecOperandU32 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            // Lanes that are off in the exec mask are skipped.
+            if (!wf->execMask(lane)) {
+                continue;
+            }
+
+            float f32Val = src[lane];
+            AMDGPU::mxfloat16 f16Val(f32Val);
+
+            // The converted value's data word is shifted right by 16
+            // before it is stored in the destination lane.
+            vdst[lane] = (f16Val.data >> 16);
+        }
+
+        vdst.write();
     } // execute
     // --- Inst_VOP1__V_CVT_F32_F16 class methods ---
 
@@ -454,7 +469,26 @@ namespace VegaISA
     void
     Inst_VOP1__V_CVT_F32_F16::execute(GPUDynInstPtr gpuDynInst)
     {
-        panicUnimplemented();
+        // Convert the F16 source operand to F32 for every active lane.
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU32 src(gpuDynInst, instData.SRC0);
+        VecOperandF32 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            // Lanes that are off in the exec mask are skipped.
+            if (!wf->execMask(lane)) {
+                continue;
+            }
+
+            // The source lane is read as raw 32-bit data and handed to
+            // mxfloat16, which is then widened to a float.
+            AMDGPU::mxfloat16 f16Val(src[lane]);
+            vdst[lane] = float(f16Val);
+        }
+
+        vdst.write();
     } // execute
     // --- Inst_VOP1__V_CVT_RPI_I32_F32 class methods ---
 
diff --git a/src/arch/amdgpu/vega/insts/vop3.cc b/src/arch/amdgpu/vega/insts/vop3.cc
index 921cd18c26..47665ad353 100644
--- a/src/arch/amdgpu/vega/insts/vop3.cc
+++ b/src/arch/amdgpu/vega/insts/vop3.cc
@@ -2790,7 +2790,54 @@ namespace VegaISA
     void
     Inst_VOP3__V_CVT_F16_F32::execute(GPUDynInstPtr gpuDynInst)
     {
-        panicUnimplemented();
+        // Convert the F32 source to F16 in every active lane, honoring the
+        // VOP3 abs/neg input modifiers and the omod/clamp output modifiers.
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
+        VecOperandU32 vdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        // Only one half of each destination lane is replaced below, so the
+        // current destination contents are read first.
+        vdst.read();
+
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
+        unsigned abs = instData.ABS;
+        unsigned neg = extData.NEG;
+        int opsel = instData.OPSEL;
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                float tmp = src0[lane];
+
+                // Bit 0 of ABS/NEG applies to src0, the only source operand.
+                if ((abs & 1) && (tmp < 0)) tmp = -tmp;
+                if (neg & 1) tmp = -tmp;
+
+                tmp = omodModifier(tmp, extData.OMOD);
+
+                // Clamping to [0.0, 1.0] is optional and must only happen
+                // when the instruction's CLAMP bit is set. Clamping
+                // unconditionally would turn every negative input into 0.0
+                // and every input above 1.0 into 1.0.
+                if (instData.CLAMP) {
+                    tmp = std::clamp(tmp, 0.0f, 1.0f);
+                }
+
+                AMDGPU::mxfloat16 out(tmp);
+
+                // If opsel[3] use upper 16-bits of dest, otherwise lower.
+                if (opsel & 8) {
+                    replaceBits(vdst[lane], 31, 16, (out.data >> 16));
+                } else {
+                    replaceBits(vdst[lane], 15, 0, (out.data >> 16));
+                }
+            }
+        }
+
+        vdst.write();
     } // execute
     // --- Inst_VOP3__V_CVT_F32_F16 class methods ---
 
@@ -2811,7 +2846,44 @@ namespace VegaISA
     void
     Inst_VOP3__V_CVT_F32_F16::execute(GPUDynInstPtr gpuDynInst)
     {
-        panicUnimplemented();
+        // Convert the F16 source to F32 in every active lane, honoring the
+        // VOP3 abs/neg input modifiers and the omod/clamp output modifiers.
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
+        VecOperandF32 vdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+        panic_if(instData.OPSEL, "OPSEL not implemented for %s", _opcode);
+
+        unsigned abs = instData.ABS;
+        unsigned neg = extData.NEG;
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                AMDGPU::mxfloat16 tmp(src0[lane]);
+
+                // Bit 0 of ABS/NEG applies to src0, the only source operand.
+                if ((abs & 1) && (tmp < 0)) tmp = -tmp;
+                if (neg & 1) tmp = -tmp;
+
+                float out = omodModifier(float(tmp), extData.OMOD);
+
+                // Clamping to [0.0, 1.0] is optional and must only happen
+                // when the instruction's CLAMP bit is set. Clamping
+                // unconditionally would turn every negative input into 0.0
+                // and every input above 1.0 into 1.0.
+                if (instData.CLAMP) {
+                    out = std::clamp(out, 0.0f, 1.0f);
+                }
+
+                vdst[lane] = out;
+            }
+        }
+
+        vdst.write();
     } // execute
     // --- Inst_VOP3__V_CVT_RPI_I32_F32 class methods ---