From 420cda1befd073b17d4f4ccfc639cd5e4df9bf2b Mon Sep 17 00:00:00 2001
From: Matthew Poremba <matthew.poremba@amd.com>
Date: Fri, 8 Dec 2023 12:12:16 -0600
Subject: [PATCH] arch-vega: Implement FP32 packed math

Starting with MI200, packed math can operate on double dword inputs. In
this case, 64-bits of inputs (two VGPRs per lane) contain two FP32
values.

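Add instructions to perform add, multiply, and FMA on packed FP32 types.
Also implement v_fmac_f32 and flat_store_short_d16_hi (reused by the
global and scratch decoders), which previously hit fatal() at decode.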

Change-Id: Ib838bff91a10e02e013cc7c33ec3d91ff08647b0
---
 src/arch/amdgpu/vega/decoder.cc            |  36 ++-
 src/arch/amdgpu/vega/gpu_decoder.hh        |   3 +
 src/arch/amdgpu/vega/insts/instructions.cc | 279 +++++++++++++++++++++
 src/arch/amdgpu/vega/insts/instructions.hh | 175 +++++++++++++
 4 files changed, 482 insertions(+), 11 deletions(-)

diff --git a/src/arch/amdgpu/vega/decoder.cc b/src/arch/amdgpu/vega/decoder.cc
index e7bea7c33b..27474d5109 100644
--- a/src/arch/amdgpu/vega/decoder.cc
+++ b/src/arch/amdgpu/vega/decoder.cc
@@ -3627,9 +3627,9 @@ namespace VegaISA
         &Decoder::decode_invalid,
         &Decoder::decode_invalid,
         &Decoder::decode_invalid,
-        &Decoder::decode_invalid,
-        &Decoder::decode_invalid,
-        &Decoder::decode_invalid,
+        &Decoder::decode_OP_VOP3P__V_PK_FMA_F32,
+        &Decoder::decode_OP_VOP3P__V_PK_MUL_F32,
+        &Decoder::decode_OP_VOP3P__V_PK_ADD_F32,
         &Decoder::decode_OP_VOP3P__V_PK_MOV_B32,
         &Decoder::decode_invalid,
         &Decoder::decode_invalid,
@@ -4203,8 +4203,7 @@ namespace VegaISA
     GPUStaticInst*
     Decoder::decode_OP_VOP2__V_FMAC_F32(MachInst iFmt)
     {
-        fatal("Trying to decode instruction without a class\n");
-        return nullptr;
+        return new Inst_VOP2__V_FMAC_F32(&iFmt->iFmt_VOP2);
     }
 
     GPUStaticInst*
@@ -8293,8 +8292,7 @@ namespace VegaISA
     GPUStaticInst*
     Decoder::decode_OP_FLAT__FLAT_STORE_SHORT_D16_HI(MachInst iFmt)
     {
-        fatal("Trying to decode instruction without a class\n");
-        return nullptr;
+        return new Inst_FLAT__FLAT_STORE_SHORT_D16_HI(&iFmt->iFmt_FLAT);
     }
 
     GPUStaticInst*
@@ -8607,8 +8605,7 @@ namespace VegaISA
     GPUStaticInst*
     Decoder::decode_OP_GLOBAL__GLOBAL_STORE_SHORT_D16_HI(MachInst iFmt)
     {
-        fatal("Trying to decode instruction without a class\n");
-        return nullptr;
+        return new Inst_FLAT__FLAT_STORE_SHORT_D16_HI(&iFmt->iFmt_FLAT);
     }
 
     GPUStaticInst*
@@ -9968,8 +9965,7 @@ namespace VegaISA
     GPUStaticInst*
     Decoder::decode_OP_SCRATCH__SCRATCH_STORE_SHORT_D16_HI(MachInst iFmt)
     {
-        fatal("Trying to decode instruction without a class\n");
-        return nullptr;
+        return new Inst_FLAT__FLAT_STORE_SHORT_D16_HI(&iFmt->iFmt_FLAT);
     }
 
     GPUStaticInst*
@@ -13105,6 +13101,24 @@ namespace VegaISA
         return nullptr;
     }
 
+    GPUStaticInst*
+    Decoder::decode_OP_VOP3P__V_PK_FMA_F32(MachInst iFmt)
+    {
+        return new Inst_VOP3P__V_PK_FMA_F32(&iFmt->iFmt_VOP3P);
+    }
+
+    GPUStaticInst*
+    Decoder::decode_OP_VOP3P__V_PK_MUL_F32(MachInst iFmt)
+    {
+        return new Inst_VOP3P__V_PK_MUL_F32(&iFmt->iFmt_VOP3P);
+    }
+
+    GPUStaticInst*
+    Decoder::decode_OP_VOP3P__V_PK_ADD_F32(MachInst iFmt)
+    {
+        return new Inst_VOP3P__V_PK_ADD_F32(&iFmt->iFmt_VOP3P);
+    }
+
     GPUStaticInst*
     Decoder::decode_OP_VOP3P__V_PK_MOV_B32(MachInst iFmt)
     {
diff --git a/src/arch/amdgpu/vega/gpu_decoder.hh b/src/arch/amdgpu/vega/gpu_decoder.hh
index 4ebb95e5f4..337011cdb8 100644
--- a/src/arch/amdgpu/vega/gpu_decoder.hh
+++ b/src/arch/amdgpu/vega/gpu_decoder.hh
@@ -1593,6 +1593,9 @@ namespace VegaISA
         GPUStaticInst* decode_OP_VOP3P__V_MAD_MIX_F32(MachInst);
         GPUStaticInst* decode_OP_VOP3P__V_MAD_MIXLO_F16(MachInst);
         GPUStaticInst* decode_OP_VOP3P__V_MAD_MIXHI_F16(MachInst);
+        GPUStaticInst* decode_OP_VOP3P__V_PK_FMA_F32(MachInst);
+        GPUStaticInst* decode_OP_VOP3P__V_PK_MUL_F32(MachInst);
+        GPUStaticInst* decode_OP_VOP3P__V_PK_ADD_F32(MachInst);
         GPUStaticInst* decode_OP_VOP3P__V_PK_MOV_B32(MachInst);
         GPUStaticInst* decode_OP_VOP3P__V_MFMA_I32_16X16X16I8(MachInst);
         GPUStaticInst* decode_OP_VOP3P__V_MFMA_F64_16X16X4F64(MachInst);
diff --git a/src/arch/amdgpu/vega/insts/instructions.cc b/src/arch/amdgpu/vega/insts/instructions.cc
index 9104f29228..5f951f860e 100644
--- a/src/arch/amdgpu/vega/insts/instructions.cc
+++ b/src/arch/amdgpu/vega/insts/instructions.cc
@@ -8129,6 +8129,40 @@ namespace VegaISA
 
         vdst.write();
     } // execute
+    // --- Inst_VOP2__V_FMAC_F32 class methods ---
+
+    Inst_VOP2__V_FMAC_F32::Inst_VOP2__V_FMAC_F32(InFmt_VOP2 *iFmt)
+        : Inst_VOP2(iFmt, "v_fmac_f32")
+    {
+        setFlag(ALU);
+    } // Inst_VOP2__V_FMAC_F32
+
+    Inst_VOP2__V_FMAC_F32::~Inst_VOP2__V_FMAC_F32()
+    {
+    } // ~Inst_VOP2__V_FMAC_F32
+
+    // FP32 fused multiply-accumulate: D.f = fma(S0.f, S1.f, D.f). Operands
+    // are typed F32 so std::fma sees float values, not integer bit patterns.
+    void
+    Inst_VOP2__V_FMAC_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
+        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
+        VecOperandF32 vdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.read();
+        vdst.read(); // VDST is also the accumulator input
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = std::fma(src0[lane], src1[lane], vdst[lane]);
+            }
+        }
+
+        vdst.write();
+    } // execute
     // --- Inst_VOP1__V_NOP class methods ---
 
     Inst_VOP1__V_NOP::Inst_VOP1__V_NOP(InFmt_VOP1 *iFmt)
@@ -44497,6 +44531,66 @@ namespace VegaISA
     Inst_FLAT__FLAT_STORE_SHORT::completeAcc(GPUDynInstPtr gpuDynInst)
     {
     } // completeAcc
+    // --- Inst_FLAT__FLAT_STORE_SHORT_D16_HI class methods ---
+
+    Inst_FLAT__FLAT_STORE_SHORT_D16_HI::
+        Inst_FLAT__FLAT_STORE_SHORT_D16_HI(InFmt_FLAT *iFmt)
+        : Inst_FLAT(iFmt, "flat_store_short_d16_hi")
+    {
+        setFlag(MemoryRef);
+        setFlag(Store);
+    } // Inst_FLAT__FLAT_STORE_SHORT_D16_HI
+
+    Inst_FLAT__FLAT_STORE_SHORT_D16_HI::~Inst_FLAT__FLAT_STORE_SHORT_D16_HI()
+    {
+    } // ~Inst_FLAT__FLAT_STORE_SHORT_D16_HI
+
+    // --- description from .arch file ---
+    // Untyped buffer store short, taken from bits [31:16] of the data VGPR.
+    void
+    Inst_FLAT__FLAT_STORE_SHORT_D16_HI::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        // No active lanes: decrement the issued-instruction counters and exit.
+        if (gpuDynInst->exec_mask.none()) {
+            wf->decVMemInstsIssued();
+            if (isFlat()) {
+                wf->decLGKMInstsIssued();
+            }
+            wf->decExpInstsIssued();
+            return;
+        }
+
+        gpuDynInst->execUnitId = wf->execUnitId;
+        gpuDynInst->latency.init(gpuDynInst->computeUnit());
+        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
+
+        ConstVecOperandU32 data(gpuDynInst, extData.DATA);
+
+        data.read();
+
+        calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET);
+        // Stage bits [31:16] of each active lane's data as a 16-bit element.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (gpuDynInst->exec_mask[lane]) {
+                (reinterpret_cast<VecElemU16*>(gpuDynInst->d_data))[lane]
+                    = (data[lane] >> 16);
+            }
+        }
+
+        issueRequestHelper(gpuDynInst);
+    } // execute
+
+    void
+    Inst_FLAT__FLAT_STORE_SHORT_D16_HI::initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+        initMemWrite<VecElemU16>(gpuDynInst); // write is VecElemU16-typed
+    } // initiateAcc
+
+    void
+    Inst_FLAT__FLAT_STORE_SHORT_D16_HI::completeAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
     // --- Inst_FLAT__FLAT_STORE_DWORD class methods ---
 
     Inst_FLAT__FLAT_STORE_DWORD::Inst_FLAT__FLAT_STORE_DWORD(InFmt_FLAT *iFmt)
@@ -45995,6 +46089,191 @@ namespace VegaISA
     {
         atomicComplete<VecOperandF64, VecElemF64>(gpuDynInst);
     } // completeAcc
+    // --- Inst_VOP3P__V_PK_FMA_F32 class methods ---
+
+    Inst_VOP3P__V_PK_FMA_F32::Inst_VOP3P__V_PK_FMA_F32(InFmt_VOP3P *iFmt)
+        : Inst_VOP3P(iFmt, "v_pk_fma_f32")
+    {
+        setFlag(ALU);
+    } // Inst_VOP3P__V_PK_FMA_F32
+
+    Inst_VOP3P__V_PK_FMA_F32::~Inst_VOP3P__V_PK_FMA_F32()
+    {
+    } // ~Inst_VOP3P__V_PK_FMA_F32
+
+    // D.f[63:32] = S0.f[63:32] * S1.f[63:32] + S2.f[63:32] . D.f[31:0] =
+    //     S0.f[31:0] * S1.f[31:0] + S2.f[31:0] .
+    void
+    Inst_VOP3P__V_PK_FMA_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        // This is a special case of packed instructions which operates on
+        // 64-bit inputs/outputs and not 32-bit. U64 is used here as float
+        // values cannot use bitwise operations. Consider the U64 to imply
+        // untyped 64-bits of data.
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU64 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandU64 src1(gpuDynInst, extData.SRC1);
+        ConstVecOperandU64 src2(gpuDynInst, extData.SRC2);
+        VecOperandU64 vdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+        src2.readSrc();
+        // Bit i of opsel/opsel_hi picks source i's half for D[31:0]/D[63:32].
+        int opsel = instData.OPSEL;
+        int opsel_hi = extData.OPSEL_HI | (instData.OPSEL_HI2 << 2);
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                uint32_t s0l = (opsel & 1) ? bits(src0[lane], 63, 32)
+                                           : bits(src0[lane], 31, 0);
+                uint32_t s1l = (opsel & 2) ? bits(src1[lane], 63, 32)
+                                           : bits(src1[lane], 31, 0);
+                uint32_t s2l = (opsel & 4) ? bits(src2[lane], 63, 32)
+                                           : bits(src2[lane], 31, 0);
+
+                float dword1 = std::fma(*reinterpret_cast<float*>(&s0l),
+                                        *reinterpret_cast<float*>(&s1l),
+                                        *reinterpret_cast<float*>(&s2l));
+
+                uint32_t s0h = (opsel_hi & 1) ? bits(src0[lane], 63, 32)
+                                              : bits(src0[lane], 31, 0);
+                uint32_t s1h = (opsel_hi & 2) ? bits(src1[lane], 63, 32)
+                                              : bits(src1[lane], 31, 0);
+                uint32_t s2h = (opsel_hi & 4) ? bits(src2[lane], 63, 32)
+                                              : bits(src2[lane], 31, 0);
+
+                float dword2 = std::fma(*reinterpret_cast<float*>(&s0h),
+                                        *reinterpret_cast<float*>(&s1h),
+                                        *reinterpret_cast<float*>(&s2h));
+
+                // A float is 4 bytes: read a 32-bit view, then widen to 64.
+                uint64_t result1 = *reinterpret_cast<uint32_t*>(&dword1);
+                uint64_t result2 = *reinterpret_cast<uint32_t*>(&dword2);
+                vdst[lane] = (result2 << 32) | result1;
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP3P__V_PK_MUL_F32 class methods ---
+
+    Inst_VOP3P__V_PK_MUL_F32::Inst_VOP3P__V_PK_MUL_F32(InFmt_VOP3P *iFmt)
+        : Inst_VOP3P(iFmt, "v_pk_mul_f32")
+    {
+        setFlag(ALU);
+    } // Inst_VOP3P__V_PK_MUL_F32
+
+    Inst_VOP3P__V_PK_MUL_F32::~Inst_VOP3P__V_PK_MUL_F32()
+    {
+    } // ~Inst_VOP3P__V_PK_MUL_F32
+
+    // D.f[63:32] = S0.f[63:32] * S1.f[63:32] . D.f[31:0] = S0.f[31:0] *
+    //              S1.f[31:0]
+    void
+    Inst_VOP3P__V_PK_MUL_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        // This is a special case of packed instructions which operates on
+        // 64-bit inputs/outputs and not 32-bit. U64 is used here as float
+        // values cannot use bitwise operations. Consider the U64 to imply
+        // untyped 64-bits of data.
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU64 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandU64 src1(gpuDynInst, extData.SRC1);
+        VecOperandU64 vdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+        // Bit i of opsel/opsel_hi picks source i's half for D[31:0]/D[63:32].
+        int opsel = instData.OPSEL;
+        int opsel_hi = extData.OPSEL_HI;
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                uint32_t s0 = (opsel & 1) ? bits(src0[lane], 63, 32)
+                                          : bits(src0[lane], 31, 0);
+                uint32_t s1 = (opsel & 2) ? bits(src1[lane], 63, 32)
+                                          : bits(src1[lane], 31, 0);
+
+                float dword1 = *reinterpret_cast<float*>(&s0)
+                             * *reinterpret_cast<float*>(&s1);
+
+                s0 = (opsel_hi & 1) ? bits(src0[lane], 63, 32)
+                                    : bits(src0[lane], 31, 0);
+                s1 = (opsel_hi & 2) ? bits(src1[lane], 63, 32)
+                                    : bits(src1[lane], 31, 0);
+
+                float dword2 = *reinterpret_cast<float*>(&s0)
+                             * *reinterpret_cast<float*>(&s1);
+
+                // A float is 4 bytes: read a 32-bit view, then widen to 64.
+                uint64_t result1 = *reinterpret_cast<uint32_t*>(&dword1);
+                uint64_t result2 = *reinterpret_cast<uint32_t*>(&dword2);
+                vdst[lane] = (result2 << 32) | result1;
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP3P__V_PK_ADD_F32 class methods ---
+
+    Inst_VOP3P__V_PK_ADD_F32::Inst_VOP3P__V_PK_ADD_F32(InFmt_VOP3P *iFmt)
+        : Inst_VOP3P(iFmt, "v_pk_add_f32")
+    {
+        setFlag(ALU);
+    } // Inst_VOP3P__V_PK_ADD_F32
+
+    Inst_VOP3P__V_PK_ADD_F32::~Inst_VOP3P__V_PK_ADD_F32()
+    {
+    } // ~Inst_VOP3P__V_PK_ADD_F32
+
+    // D.f[63:32] = S0.f[63:32] + S1.f[63:32] . D.f[31:0] = S0.f[31:0] +
+    //              S1.f[31:0]
+    void
+    Inst_VOP3P__V_PK_ADD_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        // This is a special case of packed instructions which operates on
+        // 64-bit inputs/outputs and not 32-bit. U64 is used here as float
+        // values cannot use bitwise operations. Consider the U64 to imply
+        // untyped 64-bits of data.
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU64 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandU64 src1(gpuDynInst, extData.SRC1);
+        VecOperandU64 vdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+        // Bit i of opsel/opsel_hi picks source i's half for D[31:0]/D[63:32].
+        int opsel = instData.OPSEL;
+        int opsel_hi = extData.OPSEL_HI;
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                uint32_t s0 = (opsel & 1) ? bits(src0[lane], 63, 32)
+                                          : bits(src0[lane], 31, 0);
+                uint32_t s1 = (opsel & 2) ? bits(src1[lane], 63, 32)
+                                          : bits(src1[lane], 31, 0);
+
+                float dword1 = *reinterpret_cast<float*>(&s0)
+                             + *reinterpret_cast<float*>(&s1);
+
+                s0 = (opsel_hi & 1) ? bits(src0[lane], 63, 32)
+                                    : bits(src0[lane], 31, 0);
+                s1 = (opsel_hi & 2) ? bits(src1[lane], 63, 32)
+                                    : bits(src1[lane], 31, 0);
+
+                float dword2 = *reinterpret_cast<float*>(&s0)
+                             + *reinterpret_cast<float*>(&s1);
+
+                // A float is 4 bytes: read a 32-bit view, then widen to 64.
+                uint64_t result1 = *reinterpret_cast<uint32_t*>(&dword1);
+                uint64_t result2 = *reinterpret_cast<uint32_t*>(&dword2);
+                vdst[lane] = (result2 << 32) | result1;
+            }
+        }
+
+        vdst.write();
+    } // execute
     // --- Inst_VOP3P__V_PK_MOV_B32 class methods ---
 
     Inst_VOP3P__V_PK_MOV_B32::Inst_VOP3P__V_PK_MOV_B32(InFmt_VOP3P *iFmt)
diff --git a/src/arch/amdgpu/vega/insts/instructions.hh b/src/arch/amdgpu/vega/insts/instructions.hh
index c65f4c1609..db03548a3d 100644
--- a/src/arch/amdgpu/vega/insts/instructions.hh
+++ b/src/arch/amdgpu/vega/insts/instructions.hh
@@ -8098,6 +8098,40 @@ namespace VegaISA
         void execute(GPUDynInstPtr) override;
     }; // Inst_VOP2__V_SUBREV_U32
 
+    class Inst_VOP2__V_FMAC_F32 : public Inst_VOP2 // v_fmac_f32
+    {
+      public:
+        Inst_VOP2__V_FMAC_F32(InFmt_VOP2*);
+        ~Inst_VOP2__V_FMAC_F32();
+
+        int
+        getNumOperands() override
+        {
+            return numDstRegOperands() + numSrcRegOperands();
+        } // getNumOperands
+
+        int numDstRegOperands() override { return 1; } // vdst
+        int numSrcRegOperands() override { return 2; } // src_0, src_1
+
+        int
+        getOperandSize(int opIdx) override // presumably bytes per operand
+        {
+            switch (opIdx) {
+              case 0: //src_0
+                return 4;
+              case 1: //src_1
+                return 4;
+              case 2: //vdst
+                return 4;
+              default:
+                fatal("op idx %i out of bounds\n", opIdx);
+                return -1;
+            }
+        } // getOperandSize
+
+        void execute(GPUDynInstPtr) override;
+    }; // Inst_VOP2__V_FMAC_F32
+
     class Inst_VOP1__V_NOP : public Inst_VOP1
     {
       public:
@@ -42280,6 +42314,43 @@ namespace VegaISA
         void completeAcc(GPUDynInstPtr) override;
     }; // Inst_FLAT__FLAT_STORE_SHORT
 
+    class Inst_FLAT__FLAT_STORE_SHORT_D16_HI : public Inst_FLAT
+    {
+      public:
+        Inst_FLAT__FLAT_STORE_SHORT_D16_HI(InFmt_FLAT*);
+        ~Inst_FLAT__FLAT_STORE_SHORT_D16_HI();
+
+        int
+        getNumOperands() override
+        {
+            return numDstRegOperands() + numSrcRegOperands();
+        } // getNumOperands
+
+        int numDstRegOperands() override { return 0; } // store: no dst
+        int numSrcRegOperands() override { return isFlat() ? 2 : 3; }
+
+        int
+        getOperandSize(int opIdx) override // presumably bytes per operand
+        {
+            switch (opIdx) {
+              case 0: //vgpr_addr
+                return vgprIsOffset() ? 4 : 8;
+              case 1: //vgpr_src
+                return 2; // 16-bit store payload
+              case 2: //saddr (only counted as a source when !isFlat())
+                assert(!isFlat());
+                return 8;
+              default:
+                fatal("op idx %i out of bounds\n", opIdx);
+                return -1;
+            }
+        } // getOperandSize
+
+        void execute(GPUDynInstPtr) override;
+        void initiateAcc(GPUDynInstPtr) override;
+        void completeAcc(GPUDynInstPtr) override;
+    }; // Inst_FLAT__FLAT_STORE_SHORT_D16_HI
+
     class Inst_FLAT__FLAT_STORE_DWORD : public Inst_FLAT
     {
       public:
@@ -43637,6 +43708,110 @@ namespace VegaISA
         void completeAcc(GPUDynInstPtr) override;
     }; // Inst_FLAT__FLAT_ATOMIC_MAX_F64
 
+    class Inst_VOP3P__V_PK_FMA_F32 : public Inst_VOP3P // v_pk_fma_f32
+    {
+      public:
+        Inst_VOP3P__V_PK_FMA_F32(InFmt_VOP3P*);
+        ~Inst_VOP3P__V_PK_FMA_F32();
+
+        int
+        getNumOperands() override
+        {
+            return numDstRegOperands() + numSrcRegOperands();
+        } // getNumOperands
+
+        int numDstRegOperands() override { return 1; } // dst
+        int numSrcRegOperands() override { return 3; } // src0, src1, src2
+
+        int
+        getOperandSize(int opIdx) override // presumably bytes per operand
+        {
+            switch (opIdx) {
+              case 0: // src0
+                return 8; // 64 bits: two packed FP32 values
+              case 1: // src1
+                return 8;
+              case 2: // src2
+                return 8;
+              case 3: // dst
+                return 8;
+              default:
+                fatal("op idx %i out of bounds\n", opIdx);
+                return -1;
+            }
+        } // getOperandSize
+
+        void execute(GPUDynInstPtr) override;
+    }; // Inst_VOP3P__V_PK_FMA_F32
+
+    class Inst_VOP3P__V_PK_MUL_F32 : public Inst_VOP3P // v_pk_mul_f32
+    {
+      public:
+        Inst_VOP3P__V_PK_MUL_F32(InFmt_VOP3P*);
+        ~Inst_VOP3P__V_PK_MUL_F32();
+
+        int
+        getNumOperands() override
+        {
+            return numDstRegOperands() + numSrcRegOperands();
+        } // getNumOperands
+
+        int numDstRegOperands() override { return 1; } // dst
+        int numSrcRegOperands() override { return 2; } // src0, src1
+
+        int
+        getOperandSize(int opIdx) override // presumably bytes per operand
+        {
+            switch (opIdx) {
+              case 0: // src0
+                return 8; // 64 bits: two packed FP32 values
+              case 1: // src1
+                return 8;
+              case 2: // dst
+                return 8;
+              default:
+                fatal("op idx %i out of bounds\n", opIdx);
+                return -1;
+            }
+        } // getOperandSize
+
+        void execute(GPUDynInstPtr) override;
+    }; // Inst_VOP3P__V_PK_MUL_F32
+
+    class Inst_VOP3P__V_PK_ADD_F32 : public Inst_VOP3P // v_pk_add_f32
+    {
+      public:
+        Inst_VOP3P__V_PK_ADD_F32(InFmt_VOP3P*);
+        ~Inst_VOP3P__V_PK_ADD_F32();
+
+        int
+        getNumOperands() override
+        {
+            return numDstRegOperands() + numSrcRegOperands();
+        } // getNumOperands
+
+        int numDstRegOperands() override { return 1; } // dst
+        int numSrcRegOperands() override { return 2; } // src0, src1
+
+        int
+        getOperandSize(int opIdx) override // presumably bytes per operand
+        {
+            switch (opIdx) {
+              case 0: // src0
+                return 8; // 64 bits: two packed FP32 values
+              case 1: // src1
+                return 8;
+              case 2: // dst
+                return 8;
+              default:
+                fatal("op idx %i out of bounds\n", opIdx);
+                return -1;
+            }
+        } // getOperandSize
+
+        void execute(GPUDynInstPtr) override;
+    }; // Inst_VOP3P__V_PK_ADD_F32
+
     class Inst_VOP3P__V_PK_MOV_B32 : public Inst_VOP3P
     {
       public: