From 2bb62a05e13d5e5ad3381ccee84323792ef74bb6 Mon Sep 17 00:00:00 2001
From: Matthew Poremba <matthew.poremba@amd.com>
Date: Tue, 7 May 2024 17:21:13 -0700
Subject: [PATCH] arch-vega: Implement v_cvt_pk_fp8_f32

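Add the VOP3 instruction v_cvt_pk_fp8_f32, which converts two F32
sources to FP8 and packs the two results into one 16-bit half of the
destination register. This instruction serves as a test for the MXFP8
type.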

Change-Id: I2ce30bf7f3a3ecc850a445aebdf971c37c39a79e
---
 src/arch/amdgpu/vega/gpu_decoder.cc        |  8 +++-
 src/arch/amdgpu/vega/gpu_decoder.hh        |  1 +
 src/arch/amdgpu/vega/insts/instructions.hh | 33 +++++++++++++
 src/arch/amdgpu/vega/insts/vop3.cc         | 56 ++++++++++++++++++++++
 4 files changed, 97 insertions(+), 1 deletion(-)

diff --git a/src/arch/amdgpu/vega/gpu_decoder.cc b/src/arch/amdgpu/vega/gpu_decoder.cc
index 97a22b6e37..45ad5c5af8 100644
--- a/src/arch/amdgpu/vega/gpu_decoder.cc
+++ b/src/arch/amdgpu/vega/gpu_decoder.cc
@@ -1245,7 +1245,7 @@ namespace VegaISA
         &Decoder::decode_OPU_VOP3__V_SUB_I16,
         &Decoder::decode_OPU_VOP3__V_PACK_B32_F16,
         &Decoder::decode_invalid,
-        &Decoder::decode_invalid,
+        &Decoder::decode_OPU_VOP3__V_CVT_PK_FP8_F32,
         &Decoder::decode_invalid,
         &Decoder::decode_invalid,
         &Decoder::decode_invalid,
@@ -7295,6 +7295,12 @@ namespace VegaISA
         return nullptr;
     }
 
+    GPUStaticInst*
+    Decoder::decode_OPU_VOP3__V_CVT_PK_FP8_F32(MachInst iFmt)
+    { // Wraps the VOP3A view of the encoding in a new instruction object.
+        return new Inst_VOP3__V_CVT_PK_FP8_F32(&iFmt->iFmt_VOP3A);
+    }
+
     GPUStaticInst*
     Decoder::decode_OP_DS__DS_ADD_U32(MachInst iFmt)
     {
diff --git a/src/arch/amdgpu/vega/gpu_decoder.hh b/src/arch/amdgpu/vega/gpu_decoder.hh
index 9dfd0e7c81..09163d1007 100644
--- a/src/arch/amdgpu/vega/gpu_decoder.hh
+++ b/src/arch/amdgpu/vega/gpu_decoder.hh
@@ -509,6 +509,7 @@ namespace VegaISA
         GPUStaticInst* decode_OPU_VOP3__V_ADD_I16(MachInst);
         GPUStaticInst* decode_OPU_VOP3__V_SUB_I16(MachInst);
         GPUStaticInst* decode_OPU_VOP3__V_PACK_B32_F16(MachInst);
+        GPUStaticInst* decode_OPU_VOP3__V_CVT_PK_FP8_F32(MachInst);
         GPUStaticInst* decode_OP_DS__DS_ADD_U32(MachInst);
         GPUStaticInst* decode_OP_DS__DS_SUB_U32(MachInst);
         GPUStaticInst* decode_OP_DS__DS_RSUB_U32(MachInst);
diff --git a/src/arch/amdgpu/vega/insts/instructions.hh b/src/arch/amdgpu/vega/insts/instructions.hh
index f4e93303cd..5f5a2a404e 100644
--- a/src/arch/amdgpu/vega/insts/instructions.hh
+++ b/src/arch/amdgpu/vega/insts/instructions.hh
@@ -44145,6 +44145,39 @@ namespace VegaISA
         Inst_VOP3P_MAI__V_MFMA<2, 16, 16, 4, 1, ConstVecOperandF64,
                                VecOperandF64>;
 
+    // v_cvt_pk_fp8_f32 in the VOP3A encoding: two 4-byte sources (src_0,
+    // src_1) and one 4-byte destination (vdst).
+    class Inst_VOP3__V_CVT_PK_FP8_F32 : public Inst_VOP3A
+    {
+      public:
+        Inst_VOP3__V_CVT_PK_FP8_F32(InFmt_VOP3A*);
+        ~Inst_VOP3__V_CVT_PK_FP8_F32();
+
+        int numDstRegOperands() override { return 1; }
+        int numSrcRegOperands() override { return 2; }
+
+        int
+        getNumOperands() override
+        {
+            return numDstRegOperands() + numSrcRegOperands();
+        } // getNumOperands
+
+        int
+        getOperandSize(int opIdx) override
+        {
+            switch (opIdx) {
+              case 0: //src_0
+              case 1: //src_1
+              case 2: //vdst
+                return 4;
+              default:
+                fatal("op idx %i out of bounds\n", opIdx);
+                return -1;
+            }
+        } // getOperandSize
+
+        void execute(GPUDynInstPtr) override;
+    }; // Inst_VOP3__V_CVT_PK_FP8_F32
 } // namespace VegaISA
 } // namespace gem5
 
diff --git a/src/arch/amdgpu/vega/insts/vop3.cc b/src/arch/amdgpu/vega/insts/vop3.cc
index 18446d2e2b..921cd18c26 100644
--- a/src/arch/amdgpu/vega/insts/vop3.cc
+++ b/src/arch/amdgpu/vega/insts/vop3.cc
@@ -29,6 +29,7 @@
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
+#include "arch/amdgpu/common/dtype/mxfp_types.hh"
 #include "arch/amdgpu/vega/insts/inst_util.hh"
 #include "arch/amdgpu/vega/insts/instructions.hh"
 
@@ -8920,5 +8921,60 @@ namespace VegaISA
     {
         panicUnimplemented();
     } // execute
+    // --- Inst_VOP3__V_CVT_PK_FP8_F32 class methods ---
+
+    Inst_VOP3__V_CVT_PK_FP8_F32::Inst_VOP3__V_CVT_PK_FP8_F32(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cvt_pk_fp8_f32", false)
+    {
+        setFlag(ALU); // the only flag set for this instruction
+    } // Inst_VOP3__V_CVT_PK_FP8_F32
+
+    Inst_VOP3__V_CVT_PK_FP8_F32::~Inst_VOP3__V_CVT_PK_FP8_F32()
+    { // intentionally empty body
+    } // ~Inst_VOP3__V_CVT_PK_FP8_F32
+
+    /**
+     * Convert two F32 sources to FP8 and pack them into one 16-bit half of
+     * VDST. OPSEL[3] picks the upper half; the other half is preserved.
+     */
+    void
+    Inst_VOP3__V_CVT_PK_FP8_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
+        VecOperandU32 vdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+        vdst.read(); // Only 16 bits are replaced; keep the other half.
+
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+        panic_if(instData.CLAMP, "CLAMP not supported for %s", _opcode);
+        panic_if(extData.OMOD, "OMOD not supported for %s", _opcode);
+
+        const unsigned absMod = instData.ABS;
+        const unsigned negMod = extData.NEG;
+        // OPSEL[3] selects which half-word of VDST receives the result.
+        const unsigned lsb = (instData.OPSEL & 8) ? 16 : 0;
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                AMDGPU::mxfloat8 tmp0(src0[lane]), tmp1(src1[lane]);
+
+                if ((absMod & 1) && (tmp0 < 0)) tmp0 = -tmp0;
+                if ((absMod & 2) && (tmp1 < 0)) tmp1 = -tmp1;
+                if (negMod & 1) tmp0 = -tmp0;
+                if (negMod & 2) tmp1 = -tmp1;
+                // ISA: D = { fp8(S1), fp8(S0) }, so S0 is the low byte.
+                uint16_t packed_data = (bits(tmp1.data, 31, 24) << 8)
+                                     | bits(tmp0.data, 31, 24);
+                replaceBits(vdst[lane], lsb + 15, lsb, packed_data);
+            }
+        }
+
+        vdst.write();
+    } // execute
 } // namespace VegaISA
 } // namespace gem5