arch-vega: Implement v_cvt_pk_fp8_f32

This instruction serves as a test for the MXFP8 type.

Change-Id: I2ce30bf7f3a3ecc850a445aebdf971c37c39a79e
2024-05-07 17:21:13 -07:00
parent d420a0a1e7
commit 2bb62a05e1
4 changed files with 97 additions and 1 deletion
--- a/src/arch/amdgpu/vega/gpu_decoder.cc
+++ b/src/arch/amdgpu/vega/gpu_decoder.cc
@@ -1245,7 +1245,7 @@ namespace VegaISA
        &Decoder::decode_OPU_VOP3__V_SUB_I16,
        &Decoder::decode_OPU_VOP3__V_PACK_B32_F16,
        &Decoder::decode_invalid,
-        &Decoder::decode_invalid,
+        &Decoder::decode_OPU_VOP3__V_CVT_PK_FP8_F32,
        &Decoder::decode_invalid,
        &Decoder::decode_invalid,
        &Decoder::decode_invalid,
@@ -7295,6 +7295,12 @@ namespace VegaISA
        return nullptr;
    }

+    GPUStaticInst*
+    Decoder::decode_OPU_VOP3__V_CVT_PK_FP8_F32(MachInst iFmt)
+    { // Decode handler: builds the instruction from the VOP3A view of iFmt.
+        return new Inst_VOP3__V_CVT_PK_FP8_F32(&iFmt->iFmt_VOP3A);
+    }
+
    GPUStaticInst*
    Decoder::decode_OP_DS__DS_ADD_U32(MachInst iFmt)
    {
--- a/src/arch/amdgpu/vega/gpu_decoder.hh
+++ b/src/arch/amdgpu/vega/gpu_decoder.hh
@@ -509,6 +509,7 @@ namespace VegaISA
        GPUStaticInst* decode_OPU_VOP3__V_ADD_I16(MachInst);
        GPUStaticInst* decode_OPU_VOP3__V_SUB_I16(MachInst);
        GPUStaticInst* decode_OPU_VOP3__V_PACK_B32_F16(MachInst);
+        GPUStaticInst* decode_OPU_VOP3__V_CVT_PK_FP8_F32(MachInst);
        GPUStaticInst* decode_OP_DS__DS_ADD_U32(MachInst);
        GPUStaticInst* decode_OP_DS__DS_SUB_U32(MachInst);
        GPUStaticInst* decode_OP_DS__DS_RSUB_U32(MachInst);
--- a/src/arch/amdgpu/vega/insts/instructions.hh
+++ b/src/arch/amdgpu/vega/insts/instructions.hh
@@ -44145,6 +44145,39 @@ namespace VegaISA
        Inst_VOP3P_MAI__V_MFMA<2, 16, 16, 4, 1, ConstVecOperandF64,
                               VecOperandF64>;

+    class Inst_VOP3__V_CVT_PK_FP8_F32 : public Inst_VOP3A
+    { // VOP3A-encoded instruction with two source operands and one dest.
+      public:
+        Inst_VOP3__V_CVT_PK_FP8_F32(InFmt_VOP3A*);
+        ~Inst_VOP3__V_CVT_PK_FP8_F32();
+
+        int
+        getNumOperands() override
+        {
+            return numDstRegOperands() + numSrcRegOperands();
+        } // getNumOperands: 1 destination + 2 sources = 3
+
+        int numDstRegOperands() override { return 1; }
+        int numSrcRegOperands() override { return 2; }
+
+        int
+        getOperandSize(int opIdx) override
+        {
+            switch (opIdx) {
+              case 0: // src_0
+                return 4; // presumably a size in bytes (32-bit) - TODO confirm
+              case 1: // src_1
+                return 4;
+              case 2: // vdst
+                return 4;
+              default:
+                fatal("op idx %i out of bounds\n", opIdx);
+                return -1; // presumably unreachable if fatal() does not return
+            }
+        } // getOperandSize: every operand index 0..2 reports the same size
+
+        void execute(GPUDynInstPtr) override;
+    }; // Inst_VOP3__V_CVT_PK_FP8_F32
 } // namespace VegaISA
 } // namespace gem5

--- a/src/arch/amdgpu/vega/insts/vop3.cc
+++ b/src/arch/amdgpu/vega/insts/vop3.cc
@@ -29,6 +29,7 @@
 * POSSIBILITY OF SUCH DAMAGE.
 */

+#include "arch/amdgpu/common/dtype/mxfp_types.hh"
 #include "arch/amdgpu/vega/insts/inst_util.hh"
 #include "arch/amdgpu/vega/insts/instructions.hh"

@@ -8920,5 +8921,60 @@ namespace VegaISA
    {
        panicUnimplemented();
    } // execute
+    // --- Inst_VOP3__V_CVT_PK_FP8_F32 class methods ---
+
+    Inst_VOP3__V_CVT_PK_FP8_F32::Inst_VOP3__V_CVT_PK_FP8_F32(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cvt_pk_fp8_f32", false) // NOTE(review): meaning of `false` is set by Inst_VOP3A - confirm
+    {
+        setFlag(ALU); // the only flag this constructor sets
+    } // Inst_VOP3__V_CVT_PK_FP8_F32
+
+    Inst_VOP3__V_CVT_PK_FP8_F32::~Inst_VOP3__V_CVT_PK_FP8_F32()
+    { // Intentionally empty: the class declares no data members of its own.
+    } // ~Inst_VOP3__V_CVT_PK_FP8_F32
+
+    void
+    Inst_VOP3__V_CVT_PK_FP8_F32::execute(GPUDynInstPtr gpuDynInst)
+    { // Per active lane: convert src0/src1 to mxfloat8, pack, store 16 bits.
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
+        VecOperandU32 vdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+        vdst.read(); // Preserve bits: only one 16-bit half per lane is replaced
+
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+        panic_if(instData.CLAMP, "CLAMP not supported for %s", _opcode);
+        panic_if(extData.OMOD, "OMOD not supported for %s", _opcode);
+
+        unsigned opsel = instData.OPSEL; // only mask 8 (bit 3) is consulted
+        unsigned abs = instData.ABS; // bit 0 -> src0, bit 1 -> src1
+        unsigned neg = extData.NEG; // bit 0 -> src0, bit 1 -> src1
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) { // other lanes keep the value read above
+                AMDGPU::mxfloat8 tmp0(src0[lane]), tmp1(src1[lane]);
+
+                if ((abs & 1) && (tmp0 < 0)) tmp0 = -tmp0; // abs first; NOTE(review): -0/NaN presumably fail `< 0` and keep their sign - confirm
+                if ((abs & 2) && (tmp1 < 0)) tmp1 = -tmp1;
+                if (neg & 1) tmp0 = -tmp0; // negate is applied after abs
+                if (neg & 2) tmp1 = -tmp1;
+
+                uint16_t packed_data = (bits(tmp0.data, 31, 24) << 8) // src0 -> high byte; NOTE(review): confirm byte order against the ISA
+                                     | bits(tmp1.data, 31, 24); // src1 -> low byte; presumably .data keeps the fp8 encoding in bits 31:24
+
+                if (opsel & 8) {
+                    replaceBits(vdst[lane], 31, 16, packed_data); // upper half
+                } else {
+                    replaceBits(vdst[lane], 15, 0, packed_data); // lower half
+                }
+            }
+        }
+
+        vdst.write();
+    } // execute
 } // namespace VegaISA
 } // namespace gem5