arch-vega: Add VOP3P encodings and packed 16b insts

This adds the VOP3P and VOP3P_MAI encodings from the MI200 spec. These instructions are used for packed math and miSIMD instructions. The first 19 VOP3P opcodes are implemented and validated against hardware. This includes all instructions which operate on one dword containing two packed 16-bit values of fp16, int16_t, or uint16_t. Implement one MFMA instruction for now which was also validated against hardware.
2023-12-08 12:34:23 -06:00
parent 4903fe2db1
commit bc69ab0a1f
9 changed files with 1183 additions and 41 deletions
--- a/src/arch/amdgpu/vega/SConscript
+++ b/src/arch/amdgpu/vega/SConscript
@@ -56,4 +56,6 @@ if env['CONF']['TARGET_GPU_ISA'] == 'vega':
    Source('isa.cc')
    Source('registers.cc')

+    Source('insts/vop3p.cc')
+
    DebugFlag('VEGA', 'Debug flag for VEGA GPU ISA')
--- a/src/arch/amdgpu/vega/decoder.cc
+++ b/src/arch/amdgpu/vega/decoder.cc
@@ -34,6 +34,7 @@
 #include "arch/amdgpu/vega/gpu_decoder.hh"
 #include "arch/amdgpu/vega/insts/gpu_static_inst.hh"
 #include "arch/amdgpu/vega/insts/instructions.hh"
+#include "arch/amdgpu/vega/insts/vop3p.hh"

 namespace gem5
 {
@@ -3629,6 +3630,7 @@ namespace VegaISA
        &Decoder::decode_invalid,
        &Decoder::decode_invalid,
        &Decoder::decode_invalid,
+        &Decoder::decode_OP_VOP3P__V_PK_MOV_B32,
        &Decoder::decode_invalid,
        &Decoder::decode_invalid,
        &Decoder::decode_invalid,
@@ -3687,8 +3689,7 @@ namespace VegaISA
        &Decoder::decode_invalid,
        &Decoder::decode_invalid,
        &Decoder::decode_invalid,
-        &Decoder::decode_invalid,
-        &Decoder::decode_invalid,
+        &Decoder::decode_OP_VOP3P__V_MFMA_F64_16X16X4F64,
        &Decoder::decode_invalid,
        &Decoder::decode_invalid,
        &Decoder::decode_invalid,
@@ -12920,134 +12921,115 @@ namespace VegaISA
    GPUStaticInst*
    Decoder::decode_OP_VOP3P__V_PK_MAD_I16(MachInst iFmt)
    {
-        fatal("Trying to decode instruction without a class\n");
-        return nullptr;
+        return new Inst_VOP3P__V_PK_MAD_I16(&iFmt->iFmt_VOP3P);
    }

    GPUStaticInst*
    Decoder::decode_OP_VOP3P__V_PK_MUL_LO_U16(MachInst iFmt)
    {
-        fatal("Trying to decode instruction without a class\n");
-        return nullptr;
+        return new Inst_VOP3P__V_PK_MUL_LO_U16(&iFmt->iFmt_VOP3P);
    }

    GPUStaticInst*
    Decoder::decode_OP_VOP3P__V_PK_ADD_I16(MachInst iFmt)
    {
-        fatal("Trying to decode instruction without a class\n");
-        return nullptr;
+        return new Inst_VOP3P__V_PK_ADD_I16(&iFmt->iFmt_VOP3P);
    }

    GPUStaticInst*
    Decoder::decode_OP_VOP3P__V_PK_SUB_I16(MachInst iFmt)
    {
-        fatal("Trying to decode instruction without a class\n");
-        return nullptr;
+        return new Inst_VOP3P__V_PK_SUB_I16(&iFmt->iFmt_VOP3P);
    }

    GPUStaticInst*
    Decoder::decode_OP_VOP3P__V_PK_LSHLREV_B16(MachInst iFmt)
    {
-        fatal("Trying to decode instruction without a class\n");
-        return nullptr;
+        return new Inst_VOP3P__V_PK_LSHLREV_B16(&iFmt->iFmt_VOP3P);
    }

    GPUStaticInst*
    Decoder::decode_OP_VOP3P__V_PK_LSHRREV_B16(MachInst iFmt)
    {
-        fatal("Trying to decode instruction without a class\n");
-        return nullptr;
+        return new Inst_VOP3P__V_PK_LSHRREV_B16(&iFmt->iFmt_VOP3P);
    }

    GPUStaticInst*
    Decoder::decode_OP_VOP3P__V_PK_ASHRREV_I16(MachInst iFmt)
    {
-        fatal("Trying to decode instruction without a class\n");
-        return nullptr;
+        return new Inst_VOP3P__V_PK_ASHRREV_B16(&iFmt->iFmt_VOP3P);
    }

    GPUStaticInst*
    Decoder::decode_OP_VOP3P__V_PK_MAX_I16(MachInst iFmt)
    {
-        fatal("Trying to decode instruction without a class\n");
-        return nullptr;
+        return new Inst_VOP3P__V_PK_MAX_I16(&iFmt->iFmt_VOP3P);
    }

    GPUStaticInst*
    Decoder::decode_OP_VOP3P__V_PK_MIN_I16(MachInst iFmt)
    {
-        fatal("Trying to decode instruction without a class\n");
-        return nullptr;
+        return new Inst_VOP3P__V_PK_MIN_I16(&iFmt->iFmt_VOP3P);
    }

    GPUStaticInst*
    Decoder::decode_OP_VOP3P__V_PK_MAD_U16(MachInst iFmt)
    {
-        fatal("Trying to decode instruction without a class\n");
-        return nullptr;
+        return new Inst_VOP3P__V_PK_MAD_U16(&iFmt->iFmt_VOP3P);
    }

    GPUStaticInst*
    Decoder::decode_OP_VOP3P__V_PK_ADD_U16(MachInst iFmt)
    {
-        fatal("Trying to decode instruction without a class\n");
-        return nullptr;
+        return new Inst_VOP3P__V_PK_ADD_U16(&iFmt->iFmt_VOP3P);
    }

    GPUStaticInst*
    Decoder::decode_OP_VOP3P__V_PK_SUB_U16(MachInst iFmt)
    {
-        fatal("Trying to decode instruction without a class\n");
-        return nullptr;
+        return new Inst_VOP3P__V_PK_SUB_U16(&iFmt->iFmt_VOP3P);
    }

    GPUStaticInst*
    Decoder::decode_OP_VOP3P__V_PK_MAX_U16(MachInst iFmt)
    {
-        fatal("Trying to decode instruction without a class\n");
-        return nullptr;
+        return new Inst_VOP3P__V_PK_MAX_U16(&iFmt->iFmt_VOP3P);
    }

    GPUStaticInst*
    Decoder::decode_OP_VOP3P__V_PK_MIN_U16(MachInst iFmt)
    {
-        fatal("Trying to decode instruction without a class\n");
-        return nullptr;
+        return new Inst_VOP3P__V_PK_MIN_U16(&iFmt->iFmt_VOP3P);
    }

    GPUStaticInst*
    Decoder::decode_OP_VOP3P__V_PK_FMA_F16(MachInst iFmt)
    {
-        fatal("Trying to decode instruction without a class\n");
-        return nullptr;
+        return new Inst_VOP3P__V_PK_FMA_F16(&iFmt->iFmt_VOP3P);
    }

    GPUStaticInst*
    Decoder::decode_OP_VOP3P__V_PK_ADD_F16(MachInst iFmt)
    {
-        fatal("Trying to decode instruction without a class\n");
-        return nullptr;
+        return new Inst_VOP3P__V_PK_ADD_F16(&iFmt->iFmt_VOP3P);
    }

    GPUStaticInst*
    Decoder::decode_OP_VOP3P__V_PK_MUL_F16(MachInst iFmt)
    {
-        fatal("Trying to decode instruction without a class\n");
-        return nullptr;
+        return new Inst_VOP3P__V_PK_MUL_F16(&iFmt->iFmt_VOP3P);
    }

    GPUStaticInst*
    Decoder::decode_OP_VOP3P__V_PK_MIN_F16(MachInst iFmt)
    {
-        fatal("Trying to decode instruction without a class\n");
-        return nullptr;
+        return new Inst_VOP3P__V_PK_MIN_F16(&iFmt->iFmt_VOP3P);
    }

    GPUStaticInst*
    Decoder::decode_OP_VOP3P__V_PK_MAX_F16(MachInst iFmt)
    {
-        fatal("Trying to decode instruction without a class\n");
-        return nullptr;
+        return new Inst_VOP3P__V_PK_MAX_F16(&iFmt->iFmt_VOP3P);
    }

    GPUStaticInst*
@@ -13071,6 +13053,19 @@ namespace VegaISA
        return nullptr;
    }

+    GPUStaticInst*
+    Decoder::decode_OP_VOP3P__V_PK_MOV_B32(MachInst iFmt)
+    {
+        return new Inst_VOP3P__V_PK_MOV_B32(&iFmt->iFmt_VOP3P);
+    }
+
+    GPUStaticInst*
+    Decoder::decode_OP_VOP3P__V_MFMA_F64_16X16X4F64(MachInst iFmt)
+    {
+        return new Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64(
+                &iFmt->iFmt_VOP3P_MAI);
+    }
+
    GPUStaticInst*
    Decoder::decode_invalid(MachInst iFmt)
    {
--- a/src/arch/amdgpu/vega/gpu_decoder.hh
+++ b/src/arch/amdgpu/vega/gpu_decoder.hh
@@ -1585,6 +1585,8 @@ namespace VegaISA
        GPUStaticInst* decode_OP_VOP3P__V_MAD_MIX_F32(MachInst);
        GPUStaticInst* decode_OP_VOP3P__V_MAD_MIXLO_F16(MachInst);
        GPUStaticInst* decode_OP_VOP3P__V_MAD_MIXHI_F16(MachInst);
+        GPUStaticInst* decode_OP_VOP3P__V_PK_MOV_B32(MachInst);
+        GPUStaticInst* decode_OP_VOP3P__V_MFMA_F64_16X16X4F64(MachInst);
        GPUStaticInst* subDecode_OPU_VOP3(MachInst);
        GPUStaticInst* subDecode_OP_DS(MachInst);
        GPUStaticInst* subDecode_OP_FLAT(MachInst);
@@ -1908,7 +1910,27 @@ namespace VegaISA
        unsigned int       NEG : 3;
    };

-    union InstFormat {
+    struct InFmt_VOP3P_MAI
+    {
+        unsigned int      VDST : 8;
+        unsigned int      CBSZ : 3;
+        unsigned int      ABID : 4;
+        unsigned int    ACC_CD : 1;
+        unsigned int        OP : 7;
+        unsigned int  ENCODING : 9;
+    };
+
+    struct InFmt_VOP3P_MAI_1
+    {
+        unsigned int  SRC0 : 9;
+        unsigned int  SRC1 : 9;
+        unsigned int  SRC2 : 9;
+        unsigned int   ACC : 2;
+        unsigned int  BLGP : 3;
+    };
+
+    union InstFormat
+    {
        InFmt_DS            iFmt_DS;
        InFmt_DS_1          iFmt_DS_1;
        InFmt_EXP           iFmt_EXP;
@@ -1941,6 +1963,8 @@ namespace VegaISA
        InFmt_VOP_SDWAB     iFmt_VOP_SDWAB;
        InFmt_VOP3P         iFmt_VOP3P;
        InFmt_VOP3P_1       iFmt_VOP3P_1;
+        InFmt_VOP3P_MAI     iFmt_VOP3P_MAI;
+        InFmt_VOP3P_MAI_1   iFmt_VOP3P_MAI_1;
        uint32_t            imm_u32;
        float               imm_f32;
    }; // union InstFormat
--- a/src/arch/amdgpu/vega/insts/instructions.cc
+++ b/src/arch/amdgpu/vega/insts/instructions.cc
@@ -45908,5 +45908,143 @@ namespace VegaISA
    {
        panicUnimplemented();
    } // execute
+    // --- Inst_VOP3P__V_PK_MOV_B32 class methods ---
+
+    Inst_VOP3P__V_PK_MOV_B32::Inst_VOP3P__V_PK_MOV_B32(InFmt_VOP3P *iFmt)
+        : Inst_VOP3P(iFmt, "v_pk_mov_b32")
+    {
+        setFlag(ALU);
+    } // Inst_VOP3P__V_PK_MOV_B32
+
+    Inst_VOP3P__V_PK_MOV_B32::~Inst_VOP3P__V_PK_MOV_B32()
+    {
+    } // ~Inst_VOP3P__V_PK_MOV_B32
+
+    // D.u[63:32] = S1.u[31:0]; D.u[31:0] = S0.u[31:0].
+    void
+    Inst_VOP3P__V_PK_MOV_B32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        // This is a special case of packed instructions which operates on
+        // 64-bit inputs/outputs and not 32-bit.
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU64 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandU64 src1(gpuDynInst, extData.SRC1);
+        VecOperandU64 vdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        // Only OPSEL[1:0] are used:
+        // OPSEL[0] 0/1: Lower dest dword = lower/upper dword of src0
+        // OPSEL[1] 0/1: Upper dest dword = lower/upper dword of src1
+        int opsel = instData.OPSEL;
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                // Pick each destination dword according to its OPSEL bit.
+                uint64_t lower_dword = (opsel & 1) ? bits(src0[lane], 63, 32)
+                                                   : bits(src0[lane], 31, 0);
+                uint64_t upper_dword = (opsel & 2) ? bits(src1[lane], 63, 32)
+                                                   : bits(src1[lane], 31, 0);
+
+                vdst[lane] = upper_dword << 32 | lower_dword;
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64 class methods ---
+
+    Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64::
+        Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64(InFmt_VOP3P_MAI *iFmt)
+        : Inst_VOP3P_MAI(iFmt, "v_mfma_f64_16x16x4f64")
+    {
+        setFlag(ALU);
+    } // Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64
+
+    Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64::
+        ~Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64()
+    {
+    } // ~Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64
+
+    // D(16x16F64) = A(16x4F64) x B(4x16F64) + C(16x16F64), 1 Blocks, 8
+    // pass, srcA/srcB 2 VGPR, srcC/D 8 VGPR
+    void
+    Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        int acc_offset = 0;
+        if (instData.ACC_CD) {
+            warn("ACC_CD not yet implemented\n");
+        }
+
+        // Handling of src2 is a bit tricky. The operator[] overload cannot
+        // be used for dword count > 2, and the dword count here is 8. Usually
+        // src2 is a VGPR/AccGPR, but it might also be constant. In order to
+        // use operator[] and handle constants, check for VGPR here and set
+        // a delta for each of the pairs of src2 GPRs.
+        int delta = isVectorReg(extData.SRC2) ? 2 : 0;
+
+        ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
+        ConstVecOperandF64 src2a(gpuDynInst, extData.SRC2+acc_offset);
+        ConstVecOperandF64 src2b(gpuDynInst, extData.SRC2+acc_offset+1*delta);
+        ConstVecOperandF64 src2c(gpuDynInst, extData.SRC2+acc_offset+2*delta);
+        ConstVecOperandF64 src2d(gpuDynInst, extData.SRC2+acc_offset+3*delta);
+
+        VecOperandF64 vdsta(gpuDynInst, instData.VDST+acc_offset);
+        VecOperandF64 vdstb(gpuDynInst, instData.VDST+acc_offset+2);
+        VecOperandF64 vdstc(gpuDynInst, instData.VDST+acc_offset+4);
+        VecOperandF64 vdstd(gpuDynInst, instData.VDST+acc_offset+6);
+
+        src0.readSrc();
+        src1.readSrc();
+        src2a.readSrc();
+        src2b.readSrc();
+        src2c.readSrc();
+        src2d.readSrc();
+
+        double result[16][16];
+
+        // Load src2 into result. src2 is row major
+        for (int i = 0; i < 64; ++i) {
+            // src2a contains rows 0 - 3
+            result[(i/16)][(i%16)] = src2a[i];
+            // src2b contains rows 4 - 7
+            result[(i/16)+4][(i%16)] = src2b[i];
+            // src2c contains rows 8 - 11
+            result[(i/16)+8][(i%16)] = src2c[i];
+            // src2d contains rows 12 - 15
+            result[(i/16)+12][(i%16)] = src2d[i];
+        }
+
+        // Compute new result
+        for (int i = 0; i < 16; ++i) {
+            for (int j = 0; j < 16; ++j) {
+                for (int k = 0; k < 4; ++k) {
+                    // src0 is column major, src1 is row major
+                    int lane_A = 16*k + i;
+                    int lane_B = 16*k + j;
+                    result[i][j] += src0[lane_A] * src1[lane_B];
+                }
+            }
+        }
+
+        // Put result in dest VGPRs
+        for (int i = 0; i < 64; ++i) {
+            // vdsta contains rows 0 - 3
+            vdsta[i] = result[(i/16)][(i%16)];
+            // vdstb contains rows 4 - 7
+            vdstb[i] = result[(i/16)+4][(i%16)];
+            // vdstc contains rows 8 - 11
+            vdstc[i] = result[(i/16)+8][(i%16)];
+            // vdstd contains rows 12 - 15
+            vdstd[i] = result[(i/16)+12][(i%16)];
+        }
+
+        vdsta.write();
+        vdstb.write();
+        vdstc.write();
+        vdstd.write();
+    } // execute
 } // namespace VegaISA
 } // namespace gem5
--- a/src/arch/amdgpu/vega/insts/instructions.hh
+++ b/src/arch/amdgpu/vega/insts/instructions.hh
@@ -43405,6 +43405,76 @@ namespace VegaISA

        void execute(GPUDynInstPtr) override;
    }; // Inst_FLAT__FLAT_ATOMIC_DEC_X2
+
+    class Inst_VOP3P__V_PK_MOV_B32 : public Inst_VOP3P
+    {
+      public:
+        Inst_VOP3P__V_PK_MOV_B32(InFmt_VOP3P*);
+        ~Inst_VOP3P__V_PK_MOV_B32();
+
+        int
+        getNumOperands() override
+        {
+            return numDstRegOperands() + numSrcRegOperands();
+        } // getNumOperands
+
+        int numDstRegOperands() override { return 1; }
+        int numSrcRegOperands() override { return 2; }
+
+        int
+        getOperandSize(int opIdx) override
+        {
+            switch (opIdx) {
+              case 0: // src0
+                return 8;
+              case 1: // src1
+                return 8;
+              case 2: // dst
+                return 8;
+              default:
+                fatal("op idx %i out of bounds\n", opIdx);
+                return -1;
+            }
+        } // getOperandSize
+
+        void execute(GPUDynInstPtr) override;
+    }; // Inst_VOP3P__V_PK_MOV_B32
+
+    class Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64 : public Inst_VOP3P_MAI
+    {
+      public:
+        Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64(InFmt_VOP3P_MAI*);
+        ~Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64();
+
+        int
+        getNumOperands() override
+        {
+            return numDstRegOperands() + numSrcRegOperands();
+        } // getNumOperands
+
+        int numDstRegOperands() override { return 1; }
+        int numSrcRegOperands() override { return 3; }
+
+        int
+        getOperandSize(int opIdx) override
+        {
+            switch (opIdx) {
+              case 0: // src0 "A"
+                return 8;
+              case 1: // src1 "B"
+                return 8;
+              case 2: // src2 "C"
+                return 32;
+              case 3: // dst
+                return 32;
+              default:
+                fatal("op idx %i out of bounds\n", opIdx);
+                return -1;
+            }
+        } // getOperandSize
+
+        void execute(GPUDynInstPtr) override;
+    };
 } // namespace VegaISA
 } // namespace gem5

--- a/src/arch/amdgpu/vega/insts/op_encodings.cc
+++ b/src/arch/amdgpu/vega/insts/op_encodings.cc
@@ -1178,6 +1178,158 @@ namespace VegaISA
        disassembly = dis_stream.str();
    }

+    // --- Inst_VOP3P base class methods ---
+
+    Inst_VOP3P::Inst_VOP3P(InFmt_VOP3P *iFmt, const std::string &opcode)
+        : VEGAGPUStaticInst(opcode)
+    {
+        // copy first instruction DWORD
+        instData = iFmt[0];
+        // copy second instruction DWORD
+        extData = ((InFmt_VOP3P_1 *)iFmt)[1];
+    } // Inst_VOP3P
+
+    Inst_VOP3P::~Inst_VOP3P()
+    {
+    } // ~Inst_VOP3P
+
+    void
+    Inst_VOP3P::initOperandInfo()
+    {
+        // Also takes care of bitfield addr issue
+        unsigned int srcs[3] = {extData.SRC0, extData.SRC1, extData.SRC2};
+
+        int opNum = 0;
+
+        int numSrc = numSrcRegOperands();
+
+        for (opNum = 0; opNum < numSrc; opNum++) {
+            srcOps.emplace_back(srcs[opNum], getOperandSize(opNum), true,
+                                  isScalarReg(srcs[opNum]),
+                                  isVectorReg(srcs[opNum]), false);
+        }
+
+        // There is always one dest
+        // Needed because can't take addr of bitfield
+        int reg = instData.VDST;
+        dstOps.emplace_back(reg, getOperandSize(opNum), false,
+                              false, true, false);
+        opNum++;
+
+        assert(srcOps.size() == numSrcRegOperands());
+        assert(dstOps.size() == numDstRegOperands());
+    }
+
+    int
+    Inst_VOP3P::instSize() const
+    {
+        return 8;
+    } // instSize
+
+    void
+    Inst_VOP3P::generateDisassembly()
+    {
+        std::stringstream dis_stream;
+        dis_stream << _opcode << " ";
+
+        // There is always a dest and its index is after the src operands.
+        // The output size must be a multiple of dword size.
+        int dst_size = getOperandSize(numSrcRegOperands());
+
+        dis_stream << opSelectorToRegSym(instData.VDST + 0x100, dst_size / 4);
+
+        unsigned int srcs[3] = {extData.SRC0, extData.SRC1, extData.SRC2};
+        for (int opnum = 0; opnum < numSrcRegOperands(); opnum++) {
+            int num_regs = getOperandSize(opnum) / 4;
+            dis_stream << ", " << opSelectorToRegSym(srcs[opnum], num_regs);
+        }
+
+        // Print op_sel only if at least one of its bits is non-zero
+        if (instData.OPSEL) {
+            int opsel = instData.OPSEL;
+
+            dis_stream << " op_sel:[" << bits(opsel, 0, 0) << ","
+                    << bits(opsel, 1, 1) << "," << bits(opsel, 2, 2) << "]";
+        }
+
+        disassembly = dis_stream.str();
+    }
+
+    // --- Inst_VOP3P_MAI base class methods ---
+
+    Inst_VOP3P_MAI::Inst_VOP3P_MAI(InFmt_VOP3P_MAI *iFmt,
+                                   const std::string &opcode)
+        : VEGAGPUStaticInst(opcode)
+    {
+        // copy first instruction DWORD
+        instData = iFmt[0];
+        // copy second instruction DWORD
+        extData = ((InFmt_VOP3P_MAI_1 *)iFmt)[1];
+    } // Inst_VOP3P_MAI
+
+    Inst_VOP3P_MAI::~Inst_VOP3P_MAI()
+    {
+    } // ~Inst_VOP3P_MAI
+
+    void
+    Inst_VOP3P_MAI::initOperandInfo()
+    {
+        // Also takes care of bitfield addr issue
+        unsigned int srcs[3] = {extData.SRC0, extData.SRC1, extData.SRC2};
+
+        int opNum = 0;
+
+        int numSrc = numSrcRegOperands();
+
+        for (opNum = 0; opNum < numSrc; opNum++) {
+            srcOps.emplace_back(srcs[opNum], getOperandSize(opNum), true,
+                                  isScalarReg(srcs[opNum]),
+                                  isVectorReg(srcs[opNum]), false);
+        }
+
+        // There is always one dest
+        // Needed because can't take addr of bitfield
+        int reg = instData.VDST;
+        dstOps.emplace_back(reg, getOperandSize(opNum), false,
+                              false, true, false);
+        opNum++;
+
+        assert(srcOps.size() == numSrcRegOperands());
+        assert(dstOps.size() == numDstRegOperands());
+    }
+
+    int
+    Inst_VOP3P_MAI::instSize() const
+    {
+        return 8;
+    } // instSize
+
+    void
+    Inst_VOP3P_MAI::generateDisassembly()
+    {
+        std::stringstream dis_stream;
+        dis_stream << _opcode << " ";
+
+        // There is always a dest and its index is after the src operands.
+        // The output size must be a multiple of dword size.
+        int dst_size = getOperandSize(numSrcRegOperands());
+
+        // opSelectorToRegSym handles formatting for us. VDST is always VGPR
+        // so only the last 8 bits are used. This adds the implicit 9th bit
+        // which is 1 for VGPRs as VGPR op nums are from 256-511.
+        int dst_opnum = instData.VDST + 0x100;
+
+        dis_stream << opSelectorToRegSym(dst_opnum, dst_size / 4);
+
+        unsigned int srcs[3] = {extData.SRC0, extData.SRC1, extData.SRC2};
+        for (int opnum = 0; opnum < numSrcRegOperands(); opnum++) {
+            int num_regs = getOperandSize(opnum) / 4;
+            dis_stream << ", " << opSelectorToRegSym(srcs[opnum], num_regs);
+        }
+
+        disassembly = dis_stream.str();
+    }
+
    // --- Inst_DS base class methods ---

    Inst_DS::Inst_DS(InFmt_DS *iFmt, const std::string &opcode)
--- a/src/arch/amdgpu/vega/insts/op_encodings.hh
+++ b/src/arch/amdgpu/vega/insts/op_encodings.hh
@@ -491,6 +491,149 @@ namespace VegaISA
        bool hasSecondDword(InFmt_VOP3B *);
    }; // Inst_VOP3B

+    class Inst_VOP3P : public VEGAGPUStaticInst
+    {
+      public:
+        Inst_VOP3P(InFmt_VOP3P*, const std::string &opcode);
+        ~Inst_VOP3P();
+
+        int instSize() const override;
+        void generateDisassembly() override;
+
+        void initOperandInfo() override;
+
+      protected:
+        // first instruction DWORD
+        InFmt_VOP3P instData;
+        // second instruction DWORD
+        InFmt_VOP3P_1 extData;
+
+        template<typename T>
+        void vop3pHelper(GPUDynInstPtr gpuDynInst,
+                        T (*fOpImpl)(T, T, bool))
+        {
+            Wavefront *wf = gpuDynInst->wavefront();
+            ConstVecOperandU32 S0(gpuDynInst, extData.SRC0);
+            ConstVecOperandU32 S1(gpuDynInst, extData.SRC1);
+            VecOperandU32 D(gpuDynInst, instData.VDST);
+
+            S0.readSrc();
+            S1.readSrc();
+
+            int opLo = instData.OPSEL;
+            int opHi = instData.OPSEL_HI2 << 2 | extData.OPSEL_HI;
+            int negLo = extData.NEG;
+            int negHi = instData.NEG_HI;
+            bool clamp = instData.CLMP;
+            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                if (wf->execMask(lane)) {
+                    T upper_val = fOpImpl(word<T>(S0[lane], opHi, negHi, 0),
+                                          word<T>(S1[lane], opHi, negHi, 1),
+                                          clamp);
+                    T lower_val = fOpImpl(word<T>(S0[lane], opLo, negLo, 0),
+                                          word<T>(S1[lane], opLo, negLo, 1),
+                                          clamp);
+
+                    uint16_t upper_raw =
+                        *reinterpret_cast<uint16_t*>(&upper_val);
+                    uint16_t lower_raw =
+                        *reinterpret_cast<uint16_t*>(&lower_val);
+
+                    D[lane] = upper_raw << 16 | lower_raw;
+                }
+            }
+
+            D.write();
+        }
+
+        template<typename T>
+        void vop3pHelper(GPUDynInstPtr gpuDynInst,
+                        T (*fOpImpl)(T, T, T, bool))
+        {
+            Wavefront *wf = gpuDynInst->wavefront();
+            ConstVecOperandU32 S0(gpuDynInst, extData.SRC0);
+            ConstVecOperandU32 S1(gpuDynInst, extData.SRC1);
+            ConstVecOperandU32 S2(gpuDynInst, extData.SRC2);
+            VecOperandU32 D(gpuDynInst, instData.VDST);
+
+            S0.readSrc();
+            S1.readSrc();
+            S2.readSrc();
+
+            int opLo = instData.OPSEL;
+            int opHi = instData.OPSEL_HI2 << 2 | extData.OPSEL_HI;
+            int negLo = extData.NEG;
+            int negHi = instData.NEG_HI;
+            bool clamp = instData.CLMP;
+            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                if (wf->execMask(lane)) {
+                    T upper_val = fOpImpl(word<T>(S0[lane], opHi, negHi, 0),
+                                          word<T>(S1[lane], opHi, negHi, 1),
+                                          word<T>(S2[lane], opHi, negHi, 2),
+                                          clamp);
+                    T lower_val = fOpImpl(word<T>(S0[lane], opLo, negLo, 0),
+                                          word<T>(S1[lane], opLo, negLo, 1),
+                                          word<T>(S2[lane], opLo, negLo, 2),
+                                          clamp);
+
+                    uint16_t upper_raw =
+                        *reinterpret_cast<uint16_t*>(&upper_val);
+                    uint16_t lower_raw =
+                        *reinterpret_cast<uint16_t*>(&lower_val);
+
+                    D[lane] = upper_raw << 16 | lower_raw;
+                }
+            }
+
+            D.write();
+        }
+
+      private:
+        bool hasSecondDword(InFmt_VOP3P *);
+
+        template<typename T>
+        T
+        word(uint32_t data, int opSel, int neg, int opSelBit)
+        {
+            // This method assumes two words packed into a dword
+            static_assert(sizeof(T) == 2);
+
+            bool select = bits(opSel, opSelBit, opSelBit);
+            uint16_t raw = select ? bits(data, 31, 16)
+                                  : bits(data, 15, 0);
+
+            // Apply input modifiers. This may seem odd, but the hardware
+            // just flips the MSb instead of doing unary negation.
+            bool negate = bits(neg, opSelBit, opSelBit);
+            if (negate) {
+                raw ^= 0x8000;
+            }
+
+            return *reinterpret_cast<T*>(&raw);
+        }
+    }; // Inst_VOP3P
+
+    class Inst_VOP3P_MAI : public VEGAGPUStaticInst
+    {
+      public:
+        Inst_VOP3P_MAI(InFmt_VOP3P_MAI*, const std::string &opcode);
+        ~Inst_VOP3P_MAI();
+
+        int instSize() const override;
+        void generateDisassembly() override;
+
+        void initOperandInfo() override;
+
+      protected:
+        // first instruction DWORD (VDST, CBSZ, ABID, ACC_CD, OP, ENCODING)
+        InFmt_VOP3P_MAI instData;
+        // second instruction DWORD (SRC0, SRC1, SRC2, ACC, BLGP)
+        InFmt_VOP3P_MAI_1 extData;
+
+      private:
+        bool hasSecondDword(InFmt_VOP3P_MAI *);
+    }; // Inst_VOP3P_MAI
+
    class Inst_DS : public VEGAGPUStaticInst
    {
      public:
--- a/src/arch/amdgpu/vega/insts/vop3p.cc
+++ b/src/arch/amdgpu/vega/insts/vop3p.cc
@@ -0,0 +1,302 @@
+/*
+ * Copyright (c) 2023 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "arch/amdgpu/vega/insts/vop3p.hh"
+
+#include "arch/arm/insts/fplib.hh"
+
+namespace gem5
+{
+
+namespace VegaISA
+{
+
+using half = uint16_t;
+
+// Helper functions
+int16_t
+clampI16(int32_t value, bool clamp)
+{
+    // Saturate to the int16_t range when clamping; otherwise truncate.
+    constexpr int32_t lo = std::numeric_limits<int16_t>::min();
+    constexpr int32_t hi = std::numeric_limits<int16_t>::max();
+    if (clamp) {
+        value = std::clamp(value, lo, hi);
+    }
+    return static_cast<int16_t>(value);
+}
+
+uint16_t
+clampU16(uint32_t value, bool clamp)
+{
+    // Saturate to the uint16_t range when clamping; otherwise truncate.
+    constexpr uint32_t lo = std::numeric_limits<uint16_t>::min();
+    constexpr uint32_t hi = std::numeric_limits<uint16_t>::max();
+    if (clamp) {
+        value = std::clamp(value, lo, hi);
+    }
+    return static_cast<uint16_t>(value);
+}
+
+uint16_t
+clampF16(uint16_t value, bool clamp)
+{
+    if (!clamp) {
+        return value;
+    }
+
+    // Values of one and zero in fp16.
+    constexpr uint16_t one = 0x3c00;
+    constexpr uint16_t zero = 0x0;
+    ArmISA::FPSCR fpscr1, fpscr2;
+
+    // If value > one, set to one, then if value < zero set to zero.
+    uint16_t imm = fplibMin(value, one, fpscr1);
+    return fplibMax(imm, zero, fpscr2);
+}
+
+
+
+
+// Begin instruction execute definitions
+void Inst_VOP3P__V_PK_MAD_I16::execute(GPUDynInstPtr gpuDynInst)
+{
+    auto opImpl =
+        [](int16_t S0, int16_t S1, int16_t S2, bool clamp) -> int16_t
+    {
+        return clampI16(S0 * S1 + S2, clamp);
+    };
+
+    vop3pHelper<int16_t>(gpuDynInst, opImpl);
+}
+
+void
+Inst_VOP3P__V_PK_MUL_LO_U16::execute(GPUDynInstPtr gpuDynInst)
+{
+    auto opImpl = [](uint16_t S0, uint16_t S1, bool) -> uint16_t
+    {
+        // Only return lower 16 bits of result - This operation cannot clamp.
+        // Widen first: uint16_t promotes to int, which can overflow.
+        uint32_t D = static_cast<uint32_t>(S0) * S1;
+        return static_cast<uint16_t>(D & 0xFFFF);
+    };
+
+    vop3pHelper<uint16_t>(gpuDynInst, opImpl);
+}
+
+void Inst_VOP3P__V_PK_ADD_I16::execute(GPUDynInstPtr gpuDynInst)
+{
+    auto opImpl = [](int16_t S0, int16_t S1, bool clamp) -> int16_t
+    {
+        return clampI16(S0 + S1, clamp);
+    };
+
+    vop3pHelper<int16_t>(gpuDynInst, opImpl);
+}
+
+void Inst_VOP3P__V_PK_SUB_I16::execute(GPUDynInstPtr gpuDynInst)
+{
+    auto opImpl = [](int16_t S0, int16_t S1, bool clamp) -> int16_t
+    {
+        return clampI16(S0 - S1, clamp);
+    };
+
+    vop3pHelper<int16_t>(gpuDynInst, opImpl);
+}
+
+void Inst_VOP3P__V_PK_LSHLREV_B16::execute(GPUDynInstPtr gpuDynInst)
+{
+    auto opImpl = [](uint16_t S0, uint16_t S1, bool) -> uint16_t
+    {
+        unsigned shift_val = bits(S0, 3, 0);
+
+        // Shift does not clamp
+        return S1 << shift_val;
+    };
+
+    vop3pHelper<uint16_t>(gpuDynInst, opImpl);
+}
+
+void Inst_VOP3P__V_PK_LSHRREV_B16::execute(GPUDynInstPtr gpuDynInst)
+{
+    auto opImpl = [](uint16_t S0, uint16_t S1, bool) -> uint16_t
+    {
+        unsigned shift_val = bits(S0, 3, 0);
+
+        return S1 >> shift_val;
+    };
+
+    vop3pHelper<uint16_t>(gpuDynInst, opImpl);
+}
+
+void Inst_VOP3P__V_PK_ASHRREV_B16::execute(GPUDynInstPtr gpuDynInst)
+{
+    auto opImpl = [](int16_t S0, int16_t S1, bool clamp) -> int16_t
+    {
+        // Sign extend to larger type to ensure we don't lose sign bits when
+        // shifting.
+        int32_t S1e = S1;
+        unsigned shift_val = bits(S0, 3, 0);
+
+        return S1e >> shift_val;
+    };
+
+    vop3pHelper<int16_t>(gpuDynInst, opImpl);
+}
+
+void Inst_VOP3P__V_PK_MAX_I16::execute(GPUDynInstPtr gpuDynInst)
+{
+    auto opImpl = [](int16_t S0, int16_t S1, bool clamp) -> int16_t
+    {
+        return clampI16((S0 >= S1) ? S0 : S1, clamp);
+    };
+
+    vop3pHelper<int16_t>(gpuDynInst, opImpl);
+}
+
+void Inst_VOP3P__V_PK_MIN_I16::execute(GPUDynInstPtr gpuDynInst)
+{
+    auto opImpl = [](int16_t S0, int16_t S1, bool clamp) -> int16_t
+    {
+        return clampI16((S0 < S1) ? S0 : S1, clamp);
+    };
+
+    vop3pHelper<int16_t>(gpuDynInst, opImpl);
+}
+
+void Inst_VOP3P__V_PK_MAD_U16::execute(GPUDynInstPtr gpuDynInst)
+{
+    auto opImpl =
+        [](uint16_t S0, uint16_t S1, uint16_t S2, bool clamp) -> uint16_t
+    {
+        // Multiply as uint32_t: the promoted int product could overflow.
+        return clampU16(static_cast<uint32_t>(S0) * S1 + S2, clamp);
+    };
+    vop3pHelper<uint16_t>(gpuDynInst, opImpl);
+}
+
+void Inst_VOP3P__V_PK_ADD_U16::execute(GPUDynInstPtr gpuDynInst)
+{
+    auto opImpl = [](uint16_t S0, uint16_t S1, bool clamp) -> uint16_t
+    {
+        return clampU16(S0 + S1, clamp);
+    };
+
+    vop3pHelper<uint16_t>(gpuDynInst, opImpl);
+}
+
+void Inst_VOP3P__V_PK_SUB_U16::execute(GPUDynInstPtr gpuDynInst)
+{
+    auto opImpl = [](uint16_t S0, uint16_t S1, bool clamp) -> uint16_t
+    {
+        // Unsigned saturation: an underflow clamps to 0, not to 0xffff.
+        return (clamp && S0 < S1) ? 0 : clampU16(S0 - S1, clamp);
+    };
+    vop3pHelper<uint16_t>(gpuDynInst, opImpl);
+}
+
+void Inst_VOP3P__V_PK_MAX_U16::execute(GPUDynInstPtr gpuDynInst)
+{
+    auto opImpl = [](uint16_t S0, uint16_t S1, bool clamp) -> uint16_t
+    {
+        return clampU16((S0 >= S1) ? S0 : S1, clamp);
+    };
+
+    vop3pHelper<uint16_t>(gpuDynInst, opImpl);
+}
+
+void Inst_VOP3P__V_PK_MIN_U16::execute(GPUDynInstPtr gpuDynInst)
+{
+    auto opImpl = [](uint16_t S0, uint16_t S1, bool clamp) -> uint16_t
+    {
+        return clampU16((S0 < S1) ? S0 : S1, clamp);
+    };
+
+    vop3pHelper<uint16_t>(gpuDynInst, opImpl);
+}
+
+void Inst_VOP3P__V_PK_FMA_F16::execute(GPUDynInstPtr gpuDynInst)
+{
+    auto opImpl = [](half S0, half S1, half S2, bool clamp) -> half
+    {
+        ArmISA::FPSCR fpscr;
+        return clampF16(fplibMulAdd(S2, S0, S1, fpscr), clamp);
+    };
+
+    vop3pHelper<half>(gpuDynInst, opImpl);
+}
+
+void Inst_VOP3P__V_PK_ADD_F16::execute(GPUDynInstPtr gpuDynInst)
+{
+    auto opImpl = [](half S0, half S1, bool clamp) -> half
+    {
+        ArmISA::FPSCR fpscr;
+        return clampF16(fplibAdd(S0, S1, fpscr), clamp);
+    };
+
+    vop3pHelper<half>(gpuDynInst, opImpl);
+}
+
+void Inst_VOP3P__V_PK_MUL_F16::execute(GPUDynInstPtr gpuDynInst)
+{
+    auto opImpl = [](half S0, half S1, bool clamp) -> half
+    {
+        ArmISA::FPSCR fpscr;
+        return clampF16(fplibMul(S0, S1, fpscr), clamp);
+    };
+
+    vop3pHelper<half>(gpuDynInst, opImpl);
+}
+
+void Inst_VOP3P__V_PK_MIN_F16::execute(GPUDynInstPtr gpuDynInst)
+{
+    auto opImpl = [](half S0, half S1, bool clamp) -> half
+    {
+        ArmISA::FPSCR fpscr;
+        return clampF16(fplibMin(S0, S1, fpscr), clamp);
+    };
+
+    vop3pHelper<half>(gpuDynInst, opImpl);
+}
+
+void Inst_VOP3P__V_PK_MAX_F16::execute(GPUDynInstPtr gpuDynInst)
+{
+    auto opImpl = [](half S0, half S1, bool clamp) -> half
+    {
+        ArmISA::FPSCR fpscr;
+        return clampF16(fplibMax(S0, S1, fpscr), clamp);
+    };
+
+    vop3pHelper<half>(gpuDynInst, opImpl);
+}
+
+} // namespace VegaISA
+} // namespace gem5
--- a/src/arch/amdgpu/vega/insts/vop3p.hh
+++ b/src/arch/amdgpu/vega/insts/vop3p.hh
@@ -0,0 +1,316 @@
+/*
+ * Copyright (c) 2023 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __ARCH_VEGA_INSTS_VOP3P_HH__
+#define __ARCH_VEGA_INSTS_VOP3P_HH__
+
+#include "arch/amdgpu/vega/gpu_decoder.hh"
+#include "arch/amdgpu/vega/insts/gpu_static_inst.hh"
+#include "arch/amdgpu/vega/insts/op_encodings.hh"
+#include "debug/VEGA.hh"
+
+namespace gem5
+{
+
+namespace VegaISA
+{
+    /**
+     * Shared base for VOP3P instructions with two source operands and one
+     * destination, where each operand is a dword carrying two 16-bit
+     * values. The constructor marks the instruction with the ALU flag.
+     */
+    class Inst_VOP3P__2OP_X16 : public Inst_VOP3P
+    {
+      public:
+        Inst_VOP3P__2OP_X16(InFmt_VOP3P *iFmt, const std::string& name)
+            : Inst_VOP3P(iFmt, name)
+        {
+            setFlag(ALU);
+        }
+
+        int numSrcRegOperands() override { return 2; }
+        int numDstRegOperands() override { return 1; }
+
+        // Total operand count: destination plus sources.
+        int
+        getNumOperands() override
+        {
+            const int dsts = numDstRegOperands();
+            const int srcs = numSrcRegOperands();
+            return dsts + srcs;
+        }
+
+        // Indices 0 and 1 are src0/src1 and index 2 is the destination;
+        // each is 4 bytes. Any other index is reported through fatal().
+        int
+        getOperandSize(int opIdx) override
+        {
+            const bool inRange = (opIdx >= 0) && (opIdx <= 2);
+            if (!inRange) {
+                fatal("op idx %i out of bounds\n", opIdx);
+                return -1;
+            }
+            return 4;
+        }
+    };
+
+    /**
+     * Shared base for VOP3P instructions with three source operands and one
+     * destination, where each operand is a dword carrying two 16-bit
+     * values. The constructor marks the instruction with the ALU flag.
+     */
+    class Inst_VOP3P__3OP_X16 : public Inst_VOP3P
+    {
+      public:
+        Inst_VOP3P__3OP_X16(InFmt_VOP3P *iFmt, const std::string& name)
+            : Inst_VOP3P(iFmt, name)
+        {
+            setFlag(ALU);
+        }
+
+        int numSrcRegOperands() override { return 3; }
+        int numDstRegOperands() override { return 1; }
+
+        // Total operand count: destination plus sources.
+        int
+        getNumOperands() override
+        {
+            const int dsts = numDstRegOperands();
+            const int srcs = numSrcRegOperands();
+            return dsts + srcs;
+        }
+
+        // Indices 0 to 2 are src0/src1/src2 and index 3 is the destination;
+        // each is 4 bytes. Any other index is reported through fatal().
+        int
+        getOperandSize(int opIdx) override
+        {
+            const bool inRange = (opIdx >= 0) && (opIdx <= 3);
+            if (!inRange) {
+                fatal("op idx %i out of bounds\n", opIdx);
+                return -1;
+            }
+            return 4;
+        }
+    };
+
+
+
+    // Begin instruction implementations
+    /**
+     * VOP3P instruction named "v_pk_mad_i16". It derives from the
+     * three-source packed 16-bit base; execute() is only declared here.
+     */
+    class Inst_VOP3P__V_PK_MAD_I16 : public Inst_VOP3P__3OP_X16
+    {
+      public:
+        Inst_VOP3P__V_PK_MAD_I16(InFmt_VOP3P *iFmt)
+            : Inst_VOP3P__3OP_X16(iFmt, "v_pk_mad_i16") {}
+
+        // Defined outside this class body.
+        void execute(GPUDynInstPtr gpuDynInst) override;
+    };
+
+    /**
+     * VOP3P instruction named "v_pk_mul_lo_u16". It derives from the
+     * two-source packed 16-bit base; execute() is only declared here.
+     */
+    class Inst_VOP3P__V_PK_MUL_LO_U16 : public Inst_VOP3P__2OP_X16
+    {
+      public:
+        Inst_VOP3P__V_PK_MUL_LO_U16(InFmt_VOP3P *iFmt)
+            : Inst_VOP3P__2OP_X16(iFmt, "v_pk_mul_lo_u16") {}
+
+        // Defined outside this class body.
+        void execute(GPUDynInstPtr gpuDynInst) override;
+    };
+
+    /**
+     * VOP3P instruction named "v_pk_add_i16". It derives from the
+     * two-source packed 16-bit base; execute() is only declared here.
+     */
+    class Inst_VOP3P__V_PK_ADD_I16 : public Inst_VOP3P__2OP_X16
+    {
+      public:
+        Inst_VOP3P__V_PK_ADD_I16(InFmt_VOP3P *iFmt)
+            : Inst_VOP3P__2OP_X16(iFmt, "v_pk_add_i16") {}
+
+        // Defined outside this class body.
+        void execute(GPUDynInstPtr gpuDynInst) override;
+    };
+
+    /**
+     * VOP3P instruction named "v_pk_sub_i16". It derives from the
+     * two-source packed 16-bit base; execute() is only declared here.
+     */
+    class Inst_VOP3P__V_PK_SUB_I16 : public Inst_VOP3P__2OP_X16
+    {
+      public:
+        Inst_VOP3P__V_PK_SUB_I16(InFmt_VOP3P *iFmt)
+            : Inst_VOP3P__2OP_X16(iFmt, "v_pk_sub_i16") {}
+
+        // Defined outside this class body.
+        void execute(GPUDynInstPtr gpuDynInst) override;
+    };
+
+    /**
+     * VOP3P instruction named "v_pk_lshlrev_b16". It derives from the
+     * two-source packed 16-bit base; execute() is only declared here.
+     */
+    class Inst_VOP3P__V_PK_LSHLREV_B16 : public Inst_VOP3P__2OP_X16
+    {
+      public:
+        Inst_VOP3P__V_PK_LSHLREV_B16(InFmt_VOP3P *iFmt)
+            : Inst_VOP3P__2OP_X16(iFmt, "v_pk_lshlrev_b16") {}
+
+        // Defined outside this class body.
+        void execute(GPUDynInstPtr gpuDynInst) override;
+    };
+
+    /**
+     * VOP3P instruction named "v_pk_lshrrev_b16". It derives from the
+     * two-source packed 16-bit base; execute() is only declared here.
+     */
+    class Inst_VOP3P__V_PK_LSHRREV_B16 : public Inst_VOP3P__2OP_X16
+    {
+      public:
+        Inst_VOP3P__V_PK_LSHRREV_B16(InFmt_VOP3P *iFmt)
+            : Inst_VOP3P__2OP_X16(iFmt, "v_pk_lshrrev_b16") {}
+
+        // Defined outside this class body.
+        void execute(GPUDynInstPtr gpuDynInst) override;
+    };
+
+    /**
+     * VOP3P instruction named "v_pk_ashrrev_b16". It derives from the
+     * two-source packed 16-bit base; execute() is only declared here.
+     */
+    class Inst_VOP3P__V_PK_ASHRREV_B16 : public Inst_VOP3P__2OP_X16
+    {
+      public:
+        Inst_VOP3P__V_PK_ASHRREV_B16(InFmt_VOP3P *iFmt)
+            : Inst_VOP3P__2OP_X16(iFmt, "v_pk_ashrrev_b16") {}
+
+        // Defined outside this class body.
+        void execute(GPUDynInstPtr gpuDynInst) override;
+    };
+
+    /**
+     * VOP3P instruction named "v_pk_max_i16". It derives from the
+     * two-source packed 16-bit base; execute() is only declared here.
+     */
+    class Inst_VOP3P__V_PK_MAX_I16 : public Inst_VOP3P__2OP_X16
+    {
+      public:
+        Inst_VOP3P__V_PK_MAX_I16(InFmt_VOP3P *iFmt)
+            : Inst_VOP3P__2OP_X16(iFmt, "v_pk_max_i16") {}
+
+        // Defined outside this class body.
+        void execute(GPUDynInstPtr gpuDynInst) override;
+    };
+
+    /**
+     * VOP3P instruction named "v_pk_min_i16". It derives from the
+     * two-source packed 16-bit base; execute() is only declared here.
+     */
+    class Inst_VOP3P__V_PK_MIN_I16 : public Inst_VOP3P__2OP_X16
+    {
+      public:
+        Inst_VOP3P__V_PK_MIN_I16(InFmt_VOP3P *iFmt)
+            : Inst_VOP3P__2OP_X16(iFmt, "v_pk_min_i16") {}
+
+        // Defined outside this class body.
+        void execute(GPUDynInstPtr gpuDynInst) override;
+    };
+
+    /**
+     * VOP3P instruction named "v_pk_mad_u16". It derives from the
+     * three-source packed 16-bit base; execute() is only declared here.
+     */
+    class Inst_VOP3P__V_PK_MAD_U16 : public Inst_VOP3P__3OP_X16
+    {
+      public:
+        Inst_VOP3P__V_PK_MAD_U16(InFmt_VOP3P *iFmt)
+            : Inst_VOP3P__3OP_X16(iFmt, "v_pk_mad_u16") {}
+
+        // Defined outside this class body.
+        void execute(GPUDynInstPtr gpuDynInst) override;
+    };
+
+    /**
+     * VOP3P instruction named "v_pk_add_u16". It derives from the
+     * two-source packed 16-bit base; execute() is only declared here.
+     */
+    class Inst_VOP3P__V_PK_ADD_U16 : public Inst_VOP3P__2OP_X16
+    {
+      public:
+        Inst_VOP3P__V_PK_ADD_U16(InFmt_VOP3P *iFmt)
+            : Inst_VOP3P__2OP_X16(iFmt, "v_pk_add_u16") {}
+
+        // Defined outside this class body.
+        void execute(GPUDynInstPtr gpuDynInst) override;
+    };
+
+    /**
+     * VOP3P instruction named "v_pk_sub_u16". It derives from the
+     * two-source packed 16-bit base; execute() is only declared here.
+     */
+    class Inst_VOP3P__V_PK_SUB_U16 : public Inst_VOP3P__2OP_X16
+    {
+      public:
+        Inst_VOP3P__V_PK_SUB_U16(InFmt_VOP3P *iFmt)
+            : Inst_VOP3P__2OP_X16(iFmt, "v_pk_sub_u16") {}
+
+        // Defined outside this class body.
+        void execute(GPUDynInstPtr gpuDynInst) override;
+    };
+
+    /**
+     * VOP3P instruction named "v_pk_max_u16". It derives from the
+     * two-source packed 16-bit base; execute() is only declared here.
+     */
+    class Inst_VOP3P__V_PK_MAX_U16 : public Inst_VOP3P__2OP_X16
+    {
+      public:
+        Inst_VOP3P__V_PK_MAX_U16(InFmt_VOP3P *iFmt)
+            : Inst_VOP3P__2OP_X16(iFmt, "v_pk_max_u16") {}
+
+        // Defined outside this class body.
+        void execute(GPUDynInstPtr gpuDynInst) override;
+    };
+
+    /**
+     * VOP3P instruction named "v_pk_min_u16". It derives from the
+     * two-source packed 16-bit base; execute() is only declared here.
+     */
+    class Inst_VOP3P__V_PK_MIN_U16 : public Inst_VOP3P__2OP_X16
+    {
+      public:
+        Inst_VOP3P__V_PK_MIN_U16(InFmt_VOP3P *iFmt)
+            : Inst_VOP3P__2OP_X16(iFmt, "v_pk_min_u16") {}
+
+        // Defined outside this class body.
+        void execute(GPUDynInstPtr gpuDynInst) override;
+    };
+
+    /**
+     * VOP3P instruction named "v_pk_fma_f16". It derives from the
+     * three-source packed 16-bit base; execute() is only declared here.
+     */
+    class Inst_VOP3P__V_PK_FMA_F16 : public Inst_VOP3P__3OP_X16
+    {
+      public:
+        Inst_VOP3P__V_PK_FMA_F16(InFmt_VOP3P *iFmt)
+            : Inst_VOP3P__3OP_X16(iFmt, "v_pk_fma_f16") {}
+
+        // Defined outside this class body.
+        void execute(GPUDynInstPtr gpuDynInst) override;
+    };
+
+    /**
+     * VOP3P instruction named "v_pk_add_f16". It derives from the
+     * two-source packed 16-bit base; execute() is only declared here.
+     */
+    class Inst_VOP3P__V_PK_ADD_F16 : public Inst_VOP3P__2OP_X16
+    {
+      public:
+        Inst_VOP3P__V_PK_ADD_F16(InFmt_VOP3P *iFmt)
+            : Inst_VOP3P__2OP_X16(iFmt, "v_pk_add_f16") {}
+
+        // Defined outside this class body.
+        void execute(GPUDynInstPtr gpuDynInst) override;
+    };
+
+    /**
+     * VOP3P instruction named "v_pk_mul_f16". It derives from the
+     * two-source packed 16-bit base; execute() is only declared here.
+     */
+    class Inst_VOP3P__V_PK_MUL_F16 : public Inst_VOP3P__2OP_X16
+    {
+      public:
+        Inst_VOP3P__V_PK_MUL_F16(InFmt_VOP3P *iFmt)
+            : Inst_VOP3P__2OP_X16(iFmt, "v_pk_mul_f16") {}
+
+        // Defined outside this class body.
+        void execute(GPUDynInstPtr gpuDynInst) override;
+    };
+
+    /**
+     * VOP3P instruction named "v_pk_min_f16". It derives from the
+     * two-source packed 16-bit base; execute() is only declared here.
+     */
+    class Inst_VOP3P__V_PK_MIN_F16 : public Inst_VOP3P__2OP_X16
+    {
+      public:
+        Inst_VOP3P__V_PK_MIN_F16(InFmt_VOP3P *iFmt)
+            : Inst_VOP3P__2OP_X16(iFmt, "v_pk_min_f16") {}
+
+        // Defined outside this class body.
+        void execute(GPUDynInstPtr gpuDynInst) override;
+    };
+
+    /**
+     * VOP3P instruction named "v_pk_max_f16". It derives from the
+     * two-source packed 16-bit base; execute() is only declared here.
+     */
+    class Inst_VOP3P__V_PK_MAX_F16 : public Inst_VOP3P__2OP_X16
+    {
+      public:
+        Inst_VOP3P__V_PK_MAX_F16(InFmt_VOP3P *iFmt)
+            : Inst_VOP3P__2OP_X16(iFmt, "v_pk_max_f16") {}
+
+        // Defined outside this class body.
+        void execute(GPUDynInstPtr gpuDynInst) override;
+    };
+} // namespace VegaISA
+} // namespace gem5
+
+#endif // __ARCH_VEGA_INSTS_VOP3P_HH__