diff --git a/src/arch/amdgpu/vega/SConscript b/src/arch/amdgpu/vega/SConscript index 9c6a01bf81..019ef279b3 100644 --- a/src/arch/amdgpu/vega/SConscript +++ b/src/arch/amdgpu/vega/SConscript @@ -56,4 +56,6 @@ if env['CONF']['TARGET_GPU_ISA'] == 'vega': Source('isa.cc') Source('registers.cc') + Source('insts/vop3p.cc') + DebugFlag('VEGA', 'Debug flag for VEGA GPU ISA') diff --git a/src/arch/amdgpu/vega/decoder.cc b/src/arch/amdgpu/vega/decoder.cc index 065f8c8493..2850640af2 100644 --- a/src/arch/amdgpu/vega/decoder.cc +++ b/src/arch/amdgpu/vega/decoder.cc @@ -34,6 +34,7 @@ #include "arch/amdgpu/vega/gpu_decoder.hh" #include "arch/amdgpu/vega/insts/gpu_static_inst.hh" #include "arch/amdgpu/vega/insts/instructions.hh" +#include "arch/amdgpu/vega/insts/vop3p.hh" namespace gem5 { @@ -3629,6 +3630,7 @@ namespace VegaISA &Decoder::decode_invalid, &Decoder::decode_invalid, &Decoder::decode_invalid, + &Decoder::decode_OP_VOP3P__V_PK_MOV_B32, &Decoder::decode_invalid, &Decoder::decode_invalid, &Decoder::decode_invalid, @@ -3687,8 +3689,7 @@ namespace VegaISA &Decoder::decode_invalid, &Decoder::decode_invalid, &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, + &Decoder::decode_OP_VOP3P__V_MFMA_F64_16X16X4F64, &Decoder::decode_invalid, &Decoder::decode_invalid, &Decoder::decode_invalid, @@ -12920,134 +12921,115 @@ namespace VegaISA GPUStaticInst* Decoder::decode_OP_VOP3P__V_PK_MAD_I16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_VOP3P__V_PK_MAD_I16(&iFmt->iFmt_VOP3P); } GPUStaticInst* Decoder::decode_OP_VOP3P__V_PK_MUL_LO_U16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_VOP3P__V_PK_MUL_LO_U16(&iFmt->iFmt_VOP3P); } GPUStaticInst* Decoder::decode_OP_VOP3P__V_PK_ADD_I16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new 
Inst_VOP3P__V_PK_ADD_I16(&iFmt->iFmt_VOP3P); } GPUStaticInst* Decoder::decode_OP_VOP3P__V_PK_SUB_I16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_VOP3P__V_PK_SUB_I16(&iFmt->iFmt_VOP3P); } GPUStaticInst* Decoder::decode_OP_VOP3P__V_PK_LSHLREV_B16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_VOP3P__V_PK_LSHLREV_B16(&iFmt->iFmt_VOP3P); } GPUStaticInst* Decoder::decode_OP_VOP3P__V_PK_LSHRREV_B16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_VOP3P__V_PK_LSHRREV_B16(&iFmt->iFmt_VOP3P); } GPUStaticInst* Decoder::decode_OP_VOP3P__V_PK_ASHRREV_I16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_VOP3P__V_PK_ASHRREV_B16(&iFmt->iFmt_VOP3P); } GPUStaticInst* Decoder::decode_OP_VOP3P__V_PK_MAX_I16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_VOP3P__V_PK_MAX_I16(&iFmt->iFmt_VOP3P); } GPUStaticInst* Decoder::decode_OP_VOP3P__V_PK_MIN_I16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_VOP3P__V_PK_MIN_I16(&iFmt->iFmt_VOP3P); } GPUStaticInst* Decoder::decode_OP_VOP3P__V_PK_MAD_U16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_VOP3P__V_PK_MAD_U16(&iFmt->iFmt_VOP3P); } GPUStaticInst* Decoder::decode_OP_VOP3P__V_PK_ADD_U16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_VOP3P__V_PK_ADD_U16(&iFmt->iFmt_VOP3P); } GPUStaticInst* Decoder::decode_OP_VOP3P__V_PK_SUB_U16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_VOP3P__V_PK_SUB_U16(&iFmt->iFmt_VOP3P); } GPUStaticInst* 
Decoder::decode_OP_VOP3P__V_PK_MAX_U16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_VOP3P__V_PK_MAX_U16(&iFmt->iFmt_VOP3P); } GPUStaticInst* Decoder::decode_OP_VOP3P__V_PK_MIN_U16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_VOP3P__V_PK_MIN_U16(&iFmt->iFmt_VOP3P); } GPUStaticInst* Decoder::decode_OP_VOP3P__V_PK_FMA_F16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_VOP3P__V_PK_FMA_F16(&iFmt->iFmt_VOP3P); } GPUStaticInst* Decoder::decode_OP_VOP3P__V_PK_ADD_F16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_VOP3P__V_PK_ADD_F16(&iFmt->iFmt_VOP3P); } GPUStaticInst* Decoder::decode_OP_VOP3P__V_PK_MUL_F16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_VOP3P__V_PK_MUL_F16(&iFmt->iFmt_VOP3P); } GPUStaticInst* Decoder::decode_OP_VOP3P__V_PK_MIN_F16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_VOP3P__V_PK_MIN_F16(&iFmt->iFmt_VOP3P); } GPUStaticInst* Decoder::decode_OP_VOP3P__V_PK_MAX_F16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_VOP3P__V_PK_MAX_F16(&iFmt->iFmt_VOP3P); } GPUStaticInst* @@ -13071,6 +13053,19 @@ namespace VegaISA return nullptr; } + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_PK_MOV_B32(MachInst iFmt) + { + return new Inst_VOP3P__V_PK_MOV_B32(&iFmt->iFmt_VOP3P); + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_MFMA_F64_16X16X4F64(MachInst iFmt) + { + return new Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64( + &iFmt->iFmt_VOP3P_MAI); + } + GPUStaticInst* Decoder::decode_invalid(MachInst iFmt) { diff --git a/src/arch/amdgpu/vega/gpu_decoder.hh b/src/arch/amdgpu/vega/gpu_decoder.hh index af989e0cc7..2a6f9370fb 
100644 --- a/src/arch/amdgpu/vega/gpu_decoder.hh +++ b/src/arch/amdgpu/vega/gpu_decoder.hh @@ -1585,6 +1585,8 @@ namespace VegaISA GPUStaticInst* decode_OP_VOP3P__V_MAD_MIX_F32(MachInst); GPUStaticInst* decode_OP_VOP3P__V_MAD_MIXLO_F16(MachInst); GPUStaticInst* decode_OP_VOP3P__V_MAD_MIXHI_F16(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_PK_MOV_B32(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_MFMA_F64_16X16X4F64(MachInst); GPUStaticInst* subDecode_OPU_VOP3(MachInst); GPUStaticInst* subDecode_OP_DS(MachInst); GPUStaticInst* subDecode_OP_FLAT(MachInst); @@ -1908,7 +1910,27 @@ namespace VegaISA unsigned int NEG : 3; }; - union InstFormat { + struct InFmt_VOP3P_MAI + { + unsigned int VDST : 8; + unsigned int CBSZ : 3; + unsigned int ABID : 4; + unsigned int ACC_CD : 1; + unsigned int OP : 7; + unsigned int ENCODING : 9; + }; + + struct InFmt_VOP3P_MAI_1 + { + unsigned int SRC0 : 9; + unsigned int SRC1 : 9; + unsigned int SRC2 : 9; + unsigned int ACC : 2; + unsigned int BLGP : 3; + }; + + union InstFormat + { InFmt_DS iFmt_DS; InFmt_DS_1 iFmt_DS_1; InFmt_EXP iFmt_EXP; @@ -1941,6 +1963,8 @@ namespace VegaISA InFmt_VOP_SDWAB iFmt_VOP_SDWAB; InFmt_VOP3P iFmt_VOP3P; InFmt_VOP3P_1 iFmt_VOP3P_1; + InFmt_VOP3P_MAI iFmt_VOP3P_MAI; + InFmt_VOP3P_MAI_1 iFmt_VOP3P_MAI_1; uint32_t imm_u32; float imm_f32; }; // union InstFormat diff --git a/src/arch/amdgpu/vega/insts/instructions.cc b/src/arch/amdgpu/vega/insts/instructions.cc index cd4ad74e6e..855f91699f 100644 --- a/src/arch/amdgpu/vega/insts/instructions.cc +++ b/src/arch/amdgpu/vega/insts/instructions.cc @@ -45908,5 +45908,143 @@ namespace VegaISA { panicUnimplemented(); } // execute + // --- Inst_VOP3P__V_PK_MOV_B32 class methods --- + + Inst_VOP3P__V_PK_MOV_B32::Inst_VOP3P__V_PK_MOV_B32(InFmt_VOP3P *iFmt) + : Inst_VOP3P(iFmt, "v_pk_mov_b32") + { + setFlag(ALU); + } // Inst_VOP3P__V_PK_MOV_B32 + + Inst_VOP3P__V_PK_MOV_B32::~Inst_VOP3P__V_PK_MOV_B32() + { + } // ~Inst_VOP3P__V_PK_MOV_B32 + + // D.u[63:32] = S1.u[31:0]; 
D.u[31:0] = S0.u[31:0].
+    void
+    Inst_VOP3P__V_PK_MOV_B32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        // This is a special case of packed instructions which operates on
+        // 64-bit inputs/outputs and not 32-bit.
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU64 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandU64 src1(gpuDynInst, extData.SRC1);
+        VecOperandU64 vdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        // Only OPSEL[1:0] are used
+        // OPSEL[0] 0/1: Lower dest dword = lower/upper dword of src0
+
+        int opsel = instData.OPSEL;
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                // OPSEL[1] 0/1: Upper dest dword = lower/upper dword of src1
+                uint64_t lower_dword = (opsel & 1) ? bits(src0[lane], 63, 32)
+                                                   : bits(src0[lane], 31, 0);
+                uint64_t upper_dword = (opsel & 2) ? bits(src1[lane], 63, 32)
+                                                   : bits(src1[lane], 31, 0);
+
+                vdst[lane] = upper_dword << 32 | lower_dword;
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64 class methods ---
+
+    Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64::
+    Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64(InFmt_VOP3P_MAI *iFmt)
+        : Inst_VOP3P_MAI(iFmt, "v_mfma_f64_16x16x4f64")
+    {
+        setFlag(ALU);
+    } // Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64
+
+    Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64::
+    ~Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64()
+    {
+    } // ~Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64
+
+    // D(16x16F64) = A(16x4F64) x B(4x16F64) + C(16x16F64), 1 Blocks, 8
+    // pass, srcA/srcB 2 VGPR, srcC/D 8 VGPR
+    void
+    Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        int acc_offset = 0;
+        if (instData.ACC_CD) {
+            warn("ACC_CD not yet implemented\n");
+        }
+
+        // Handling of src2 is a bit tricky. The operator[] overload cannot
+        // be used for dword count > 2, and the dword count here is 8. Usually
+        // src2 is a VGPR/AccGPR, but it might also be constant. In order to
+        // use operator[] and handle constants, check for VGPR here and set
+        // a delta for each of the pairs of src2 GPRs.
+        int delta = isVectorReg(extData.SRC2) ? 2 : 0;
+
+        ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
+        ConstVecOperandF64 src2a(gpuDynInst, extData.SRC2+acc_offset);
+        ConstVecOperandF64 src2b(gpuDynInst, extData.SRC2+acc_offset+1*delta);
+        ConstVecOperandF64 src2c(gpuDynInst, extData.SRC2+acc_offset+2*delta);
+        ConstVecOperandF64 src2d(gpuDynInst, extData.SRC2+acc_offset+3*delta);
+
+        VecOperandF64 vdsta(gpuDynInst, instData.VDST+acc_offset);
+        VecOperandF64 vdstb(gpuDynInst, instData.VDST+acc_offset+2);
+        VecOperandF64 vdstc(gpuDynInst, instData.VDST+acc_offset+4);
+        VecOperandF64 vdstd(gpuDynInst, instData.VDST+acc_offset+6);
+
+        src0.readSrc();
+        src1.readSrc();
+        src2a.readSrc();
+        src2b.readSrc();
+        src2c.readSrc();
+        src2d.readSrc();
+
+        double result[16][16];
+
+        // Load src2 into result. src2 is row major
+        for (int i = 0; i < 64; ++i) {
+            // src2a contains rows 0 - 3
+            result[(i/16)][(i%16)] = src2a[i];
+            // src2b contains rows 4 - 7
+            result[(i/16)+4][(i%16)] = src2b[i];
+            // src2c contains rows 8 - 11
+            result[(i/16)+8][(i%16)] = src2c[i];
+            // src2d contains rows 12 - 15
+            result[(i/16)+12][(i%16)] = src2d[i];
+        }
+
+        // Compute new result
+        for (int i = 0; i < 16; ++i) {
+            for (int j = 0; j < 16; ++j) {
+                for (int k = 0; k < 4; ++k) {
+                    // src0 is column major, src1 is row major
+                    int lane_A = 16*k + i;
+                    int lane_B = 16*k + j;
+                    result[i][j] += src0[lane_A] * src1[lane_B];
+                }
+            }
+        }
+
+        // Put result in dest VGPRs
+        for (int i = 0; i < 64; ++i) {
+            // vdsta contains rows 0 - 3
+            vdsta[i] = result[(i/16)][(i%16)];
+            // vdstb contains rows 4 - 7
+            vdstb[i] = result[(i/16)+4][(i%16)];
+            // vdstc contains rows 8 - 11
+            vdstc[i] = result[(i/16)+8][(i%16)];
+            // vdstd contains rows 12 - 15
+            vdstd[i] = result[(i/16)+12][(i%16)];
+        }
+
+        vdsta.write();
+        vdstb.write();
+
vdstc.write(); + vdstd.write(); + } // execute } // namespace VegaISA } // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/instructions.hh b/src/arch/amdgpu/vega/insts/instructions.hh index ca349c365f..22423e14c6 100644 --- a/src/arch/amdgpu/vega/insts/instructions.hh +++ b/src/arch/amdgpu/vega/insts/instructions.hh @@ -43405,6 +43405,76 @@ namespace VegaISA void execute(GPUDynInstPtr) override; }; // Inst_FLAT__FLAT_ATOMIC_DEC_X2 + + class Inst_VOP3P__V_PK_MOV_B32 : public Inst_VOP3P + { + public: + Inst_VOP3P__V_PK_MOV_B32(InFmt_VOP3P*); + ~Inst_VOP3P__V_PK_MOV_B32(); + + int + getNumOperands() override + { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return 2; } + + int + getOperandSize(int opIdx) override + { + switch (opIdx) { + case 0: // src0 + return 8; + case 1: // src1 + return 8; + case 2: // dst + return 8; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } // getOperandSize + + void execute(GPUDynInstPtr) override; + }; // Inst_VOP3P__V_PK_MOV_B32 + + class Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64 : public Inst_VOP3P_MAI + { + public: + Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64(InFmt_VOP3P_MAI*); + ~Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64(); + + int + getNumOperands() override + { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return 3; } + + int + getOperandSize(int opIdx) override + { + switch (opIdx) { + case 0: // src0 "A" + return 8; + case 1: // src1 "B" + return 8; + case 2: // src2 "C" + return 32; + case 3: // dst + return 32; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } // getOperandSize + + void execute(GPUDynInstPtr) override; + }; } // namespace VegaISA } // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/op_encodings.cc 
b/src/arch/amdgpu/vega/insts/op_encodings.cc index c934094d9b..c1302b8b49 100644 --- a/src/arch/amdgpu/vega/insts/op_encodings.cc +++ b/src/arch/amdgpu/vega/insts/op_encodings.cc @@ -1178,6 +1178,158 @@ namespace VegaISA disassembly = dis_stream.str(); } + // --- Inst_VOP3P base class methods --- + + Inst_VOP3P::Inst_VOP3P(InFmt_VOP3P *iFmt, const std::string &opcode) + : VEGAGPUStaticInst(opcode) + { + // copy first instruction DWORD + instData = iFmt[0]; + // copy second instruction DWORD + extData = ((InFmt_VOP3P_1 *)iFmt)[1]; + } // Inst_VOP3P + + Inst_VOP3P::~Inst_VOP3P() + { + } // ~Inst_VOP3P + + void + Inst_VOP3P::initOperandInfo() + { + // Also takes care of bitfield addr issue + unsigned int srcs[3] = {extData.SRC0, extData.SRC1, extData.SRC2}; + + int opNum = 0; + + int numSrc = numSrcRegOperands(); + + for (opNum = 0; opNum < numSrc; opNum++) { + srcOps.emplace_back(srcs[opNum], getOperandSize(opNum), true, + isScalarReg(srcs[opNum]), + isVectorReg(srcs[opNum]), false); + } + + // There is always one dest + // Needed because can't take addr of bitfield + int reg = instData.VDST; + dstOps.emplace_back(reg, getOperandSize(opNum), false, + false, true, false); + opNum++; + + assert(srcOps.size() == numSrcRegOperands()); + assert(dstOps.size() == numDstRegOperands()); + } + + int + Inst_VOP3P::instSize() const + { + return 8; + } // instSize + + void + Inst_VOP3P::generateDisassembly() + { + std::stringstream dis_stream; + dis_stream << _opcode << " "; + + // There is always a dest and the index is after the src operands + // The output size much be a multiple of dword size + int dst_size = getOperandSize(numSrcRegOperands()); + + dis_stream << opSelectorToRegSym(instData.VDST + 0x100, dst_size / 4); + + unsigned int srcs[3] = {extData.SRC0, extData.SRC1, extData.SRC2}; + for (int opnum = 0; opnum < numSrcRegOperands(); opnum++) { + int num_regs = getOperandSize(opnum) / 4; + dis_stream << ", " << opSelectorToRegSym(srcs[opnum], num_regs); + } + + // Print 
op_sel only if one is non-zero
+        if (instData.OPSEL) {
+            int opsel = instData.OPSEL;
+
+            dis_stream << " op_sel:[" << bits(opsel, 0, 0) << ","
+                << bits(opsel, 1, 1) << "," << bits(opsel, 2, 2) << "]";
+        }
+
+        disassembly = dis_stream.str();
+    }
+
+    // --- Inst_VOP3P_MAI base class methods ---
+
+    Inst_VOP3P_MAI::Inst_VOP3P_MAI(InFmt_VOP3P_MAI *iFmt,
+                                   const std::string &opcode)
+        : VEGAGPUStaticInst(opcode)
+    {
+        // copy first instruction DWORD
+        instData = iFmt[0];
+        // copy second instruction DWORD
+        extData = ((InFmt_VOP3P_MAI_1 *)iFmt)[1];
+    } // Inst_VOP3P_MAI
+
+    Inst_VOP3P_MAI::~Inst_VOP3P_MAI()
+    {
+    } // ~Inst_VOP3P_MAI
+
+    void
+    Inst_VOP3P_MAI::initOperandInfo()
+    {
+        // Also takes care of bitfield addr issue
+        unsigned int srcs[3] = {extData.SRC0, extData.SRC1, extData.SRC2};
+
+        int opNum = 0;
+
+        int numSrc = numSrcRegOperands();
+
+        for (opNum = 0; opNum < numSrc; opNum++) {
+            srcOps.emplace_back(srcs[opNum], getOperandSize(opNum), true,
+                                isScalarReg(srcs[opNum]),
+                                isVectorReg(srcs[opNum]), false);
+        }
+
+        // There is always one dest
+        // Needed because can't take addr of bitfield
+        int reg = instData.VDST;
+        dstOps.emplace_back(reg, getOperandSize(opNum), false,
+                            false, true, false);
+        opNum++;
+
+        assert(srcOps.size() == numSrcRegOperands());
+        assert(dstOps.size() == numDstRegOperands());
+    }
+
+    int
+    Inst_VOP3P_MAI::instSize() const
+    {
+        return 8;
+    } // instSize
+
+    void
+    Inst_VOP3P_MAI::generateDisassembly()
+    {
+        std::stringstream dis_stream;
+        dis_stream << _opcode << " ";
+
+        // There is always a dest and the index is after the src operands
+        // The output size must be a multiple of dword size
+        int dst_size = getOperandSize(numSrcRegOperands());
+
+        // opSelectorToRegSym handles formatting for us. VDST is always VGPR
+        // so only the last 8 bits are used. This adds the implicit 9th bit
+        // which is 1 for VGPRs as VGPR op nums are from 256-511.
+        int dst_opnum = instData.VDST + 0x100;
+
+        dis_stream << opSelectorToRegSym(dst_opnum, dst_size / 4);
+
+        unsigned int srcs[3] = {extData.SRC0, extData.SRC1, extData.SRC2};
+        for (int opnum = 0; opnum < numSrcRegOperands(); opnum++) {
+            int num_regs = getOperandSize(opnum) / 4;
+            dis_stream << ", " << opSelectorToRegSym(srcs[opnum], num_regs);
+        }
+
+        disassembly = dis_stream.str();
+    }
+
     // --- Inst_DS base class methods ---
 
     Inst_DS::Inst_DS(InFmt_DS *iFmt, const std::string &opcode)
diff --git a/src/arch/amdgpu/vega/insts/op_encodings.hh b/src/arch/amdgpu/vega/insts/op_encodings.hh
index a1c5e99c91..bf4ee2b0c9 100644
--- a/src/arch/amdgpu/vega/insts/op_encodings.hh
+++ b/src/arch/amdgpu/vega/insts/op_encodings.hh
@@ -491,6 +491,149 @@ namespace VegaISA
         bool hasSecondDword(InFmt_VOP3B *);
     }; // Inst_VOP3B
 
+    class Inst_VOP3P : public VEGAGPUStaticInst
+    {
+      public:
+        Inst_VOP3P(InFmt_VOP3P*, const std::string &opcode);
+        ~Inst_VOP3P();
+
+        int instSize() const override;
+        void generateDisassembly() override;
+
+        void initOperandInfo() override;
+
+      protected:
+        // first instruction DWORD
+        InFmt_VOP3P instData;
+        // second instruction DWORD
+        InFmt_VOP3P_1 extData;
+
+        template<typename T>
+        void vop3pHelper(GPUDynInstPtr gpuDynInst,
+                         T (*fOpImpl)(T, T, bool))
+        {
+            Wavefront *wf = gpuDynInst->wavefront();
+            ConstVecOperandU32 S0(gpuDynInst, extData.SRC0);
+            ConstVecOperandU32 S1(gpuDynInst, extData.SRC1);
+            VecOperandU32 D(gpuDynInst, instData.VDST);
+
+            S0.readSrc();
+            S1.readSrc();
+
+            int opLo = instData.OPSEL;
+            int opHi = instData.OPSEL_HI2 << 2 | extData.OPSEL_HI;
+            int negLo = extData.NEG;
+            int negHi = instData.NEG_HI;
+            bool clamp = instData.CLMP;
+            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                if (wf->execMask(lane)) {
+                    T upper_val = fOpImpl(word<T>(S0[lane], opHi, negHi, 0),
+                                          word<T>(S1[lane], opHi, negHi, 1),
+                                          clamp);
+                    T lower_val = fOpImpl(word<T>(S0[lane], opLo, negLo, 0),
+                                          word<T>(S1[lane], opLo, negLo, 1),
+                                          clamp);
+
+                    uint16_t upper_raw =
+                        *reinterpret_cast<uint16_t*>(&upper_val);
+                    uint16_t lower_raw =
+                        *reinterpret_cast<uint16_t*>(&lower_val);
+
+                    D[lane] = upper_raw << 16 | lower_raw;
+                }
+            }
+
+            D.write();
+        }
+
+        template<typename T>
+        void vop3pHelper(GPUDynInstPtr gpuDynInst,
+                         T (*fOpImpl)(T, T, T, bool))
+        {
+            Wavefront *wf = gpuDynInst->wavefront();
+            ConstVecOperandU32 S0(gpuDynInst, extData.SRC0);
+            ConstVecOperandU32 S1(gpuDynInst, extData.SRC1);
+            ConstVecOperandU32 S2(gpuDynInst, extData.SRC2);
+            VecOperandU32 D(gpuDynInst, instData.VDST);
+
+            S0.readSrc();
+            S1.readSrc();
+            S2.readSrc();
+
+            int opLo = instData.OPSEL;
+            int opHi = instData.OPSEL_HI2 << 2 | extData.OPSEL_HI;
+            int negLo = extData.NEG;
+            int negHi = instData.NEG_HI;
+            bool clamp = instData.CLMP;
+            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                if (wf->execMask(lane)) {
+                    T upper_val = fOpImpl(word<T>(S0[lane], opHi, negHi, 0),
+                                          word<T>(S1[lane], opHi, negHi, 1),
+                                          word<T>(S2[lane], opHi, negHi, 2),
+                                          clamp);
+                    T lower_val = fOpImpl(word<T>(S0[lane], opLo, negLo, 0),
+                                          word<T>(S1[lane], opLo, negLo, 1),
+                                          word<T>(S2[lane], opLo, negLo, 2),
+                                          clamp);
+
+                    uint16_t upper_raw =
+                        *reinterpret_cast<uint16_t*>(&upper_val);
+                    uint16_t lower_raw =
+                        *reinterpret_cast<uint16_t*>(&lower_val);
+
+                    D[lane] = upper_raw << 16 | lower_raw;
+                }
+            }
+
+            D.write();
+        }
+
+      private:
+        bool hasSecondDword(InFmt_VOP3P *);
+
+        template<typename T>
+        T
+        word(uint32_t data, int opSel, int neg, int opSelBit)
+        {
+            // This method assumes two words packed into a dword
+            static_assert(sizeof(T) == 2);
+
+            bool select = bits(opSel, opSelBit, opSelBit);
+            uint16_t raw = select ? bits(data, 31, 16)
+                                  : bits(data, 15, 0);
+
+            // Apply input modifiers. This may seem odd, but the hardware
+            // just flips the MSb instead of doing unary negation.
+            bool negate = bits(neg, opSelBit, opSelBit);
+            if (negate) {
+                raw ^= 0x8000;
+            }
+
+            return *reinterpret_cast<T*>(&raw);
+        }
+    }; // Inst_VOP3P
+
+    class Inst_VOP3P_MAI : public VEGAGPUStaticInst
+    {
+      public:
+        Inst_VOP3P_MAI(InFmt_VOP3P_MAI*, const std::string &opcode);
+        ~Inst_VOP3P_MAI();
+
+        int instSize() const override;
+        void generateDisassembly() override;
+
+        void initOperandInfo() override;
+
+      protected:
+        // first instruction DWORD
+        InFmt_VOP3P_MAI instData;
+        // second instruction DWORD
+        InFmt_VOP3P_MAI_1 extData;
+
+      private:
+        bool hasSecondDword(InFmt_VOP3P_MAI *);
+    }; // Inst_VOP3P_MAI
+
     class Inst_DS : public VEGAGPUStaticInst
     {
       public:
diff --git a/src/arch/amdgpu/vega/insts/vop3p.cc b/src/arch/amdgpu/vega/insts/vop3p.cc
new file mode 100644
index 0000000000..a356d3bab3
--- /dev/null
+++ b/src/arch/amdgpu/vega/insts/vop3p.cc
@@ -0,0 +1,302 @@
+/*
+ * Copyright (c) 2023 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "arch/amdgpu/vega/insts/vop3p.hh"
+
+#include "arch/arm/insts/fplib.hh"
+
+namespace gem5
+{
+
+namespace VegaISA
+{
+
+using half = uint16_t;
+
+// Helper functions
+int16_t
+clampI16(int32_t value, bool clamp)
+{
+    if (!clamp) {
+        return static_cast<int16_t>(value);
+    }
+
+    return std::clamp(value,
+        static_cast<int32_t>(std::numeric_limits<int16_t>::min()),
+        static_cast<int32_t>(std::numeric_limits<int16_t>::max()));
+}
+
+uint16_t
+clampU16(uint32_t value, bool clamp)
+{
+    if (!clamp) {
+        return static_cast<uint16_t>(value);
+    }
+
+    return std::clamp(value,
+        static_cast<uint32_t>(std::numeric_limits<uint16_t>::min()),
+        static_cast<uint32_t>(std::numeric_limits<uint16_t>::max()));
+}
+
+uint16_t
+clampF16(uint16_t value, bool clamp)
+{
+    if (!clamp) {
+        return value;
+    }
+
+    // Values of one and zero in fp16.
+    constexpr uint16_t one = 0x3c00;
+    constexpr uint16_t zero = 0x0;
+    ArmISA::FPSCR fpscr1, fpscr2;
+
+    // If value > one, set to one, then if value < zero set to zero.
+ uint16_t imm = fplibMin(value, one, fpscr1); + return fplibMax(imm, zero, fpscr2); +} + + + + +// Begin instruction execute definitions +void Inst_VOP3P__V_PK_MAD_I16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = + [](int16_t S0, int16_t S1, int16_t S2, bool clamp) -> int16_t + { + return clampI16(S0 * S1 + S2, clamp); + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +void +Inst_VOP3P__V_PK_MUL_LO_U16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = [](uint16_t S0, uint16_t S1, bool) -> uint16_t + { + // Only return lower 16 bits of result - This operation cannot clamp. + uint32_t D = S0 * S1; + uint16_t Dh = D & 0xFFFF; + return Dh; + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_PK_ADD_I16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = [](int16_t S0, int16_t S1, bool clamp) -> int16_t + { + return clampI16(S0 + S1, clamp); + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_PK_SUB_I16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = [](int16_t S0, int16_t S1, bool clamp) -> int16_t + { + return clampI16(S0 - S1, clamp); + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_PK_LSHLREV_B16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = [](uint16_t S0, uint16_t S1, bool) -> uint16_t + { + unsigned shift_val = bits(S0, 3, 0); + + // Shift does not clamp + return S1 << shift_val; + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_PK_LSHRREV_B16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = [](uint16_t S0, uint16_t S1, bool) -> uint16_t + { + unsigned shift_val = bits(S0, 3, 0); + + return S1 >> shift_val; + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_PK_ASHRREV_B16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = [](int16_t S0, int16_t S1, bool clamp) -> int16_t + { + // Sign extend to larger type to ensure we don't lose sign bits when + // shifting. 
+ int32_t S1e = S1; + unsigned shift_val = bits(S0, 3, 0); + + return S1e >> shift_val; + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_PK_MAX_I16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = [](int16_t S0, int16_t S1, bool clamp) -> int16_t + { + return clampI16((S0 >= S1) ? S0 : S1, clamp); + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_PK_MIN_I16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = [](int16_t S0, int16_t S1, bool clamp) -> int16_t + { + return clampI16((S0 < S1) ? S0 : S1, clamp); + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_PK_MAD_U16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = + [](uint16_t S0, uint16_t S1, uint16_t S2, bool clamp) -> uint16_t + { + return clampU16(S0 * S1 + S2, clamp); + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_PK_ADD_U16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = [](uint16_t S0, uint16_t S1, bool clamp) -> uint16_t + { + return clampU16(S0 + S1, clamp); + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_PK_SUB_U16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = [](uint16_t S0, uint16_t S1, bool clamp) -> uint16_t + { + return clampU16(S0 - S1, clamp); + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_PK_MAX_U16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = [](uint16_t S0, uint16_t S1, bool clamp) -> uint16_t + { + return clampU16((S0 >= S1) ? S0 : S1, clamp); + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_PK_MIN_U16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = [](uint16_t S0, uint16_t S1, bool clamp) -> uint16_t + { + return clampU16((S0 < S1) ? 
S0 : S1, clamp); + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_PK_FMA_F16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = [](half S0, half S1, half S2, bool clamp) -> half + { + ArmISA::FPSCR fpscr; + return clampF16(fplibMulAdd(S2, S0, S1, fpscr), clamp); + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_PK_ADD_F16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = [](half S0, half S1, bool clamp) -> half + { + ArmISA::FPSCR fpscr; + return clampF16(fplibAdd(S0, S1, fpscr), clamp); + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_PK_MUL_F16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = [](half S0, half S1, bool clamp) -> half + { + ArmISA::FPSCR fpscr; + return clampF16(fplibMul(S0, S1, fpscr), clamp); + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_PK_MIN_F16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = [](half S0, half S1, bool clamp) -> half + { + ArmISA::FPSCR fpscr; + return clampF16(fplibMin(S0, S1, fpscr), clamp); + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_PK_MAX_F16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = [](half S0, half S1, bool clamp) -> half + { + ArmISA::FPSCR fpscr; + return clampF16(fplibMax(S0, S1, fpscr), clamp); + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +} // namespace VegaISA +} // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/vop3p.hh b/src/arch/amdgpu/vega/insts/vop3p.hh new file mode 100644 index 0000000000..56f0f80435 --- /dev/null +++ b/src/arch/amdgpu/vega/insts/vop3p.hh @@ -0,0 +1,316 @@ +/* + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __ARCH_VEGA_INSTS_VOP3P_HH__ +#define __ARCH_VEGA_INSTS_VOP3P_HH__ + +#include "arch/amdgpu/vega/gpu_decoder.hh" +#include "arch/amdgpu/vega/insts/gpu_static_inst.hh" +#include "arch/amdgpu/vega/insts/op_encodings.hh" +#include "debug/VEGA.hh" + +namespace gem5 +{ + +namespace VegaISA +{ + // Two source operands with two 16-bit values in a dword + class Inst_VOP3P__2OP_X16 : public Inst_VOP3P + { + public: + Inst_VOP3P__2OP_X16(InFmt_VOP3P *iFmt, const std::string& name) + : Inst_VOP3P(iFmt, name) + { + setFlag(ALU); + } + + int + getNumOperands() override + { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return 2; } + + int + getOperandSize(int opIdx) override + { + switch (opIdx) { + case 0: // src0 + return 4; + case 1: // src1 + return 4; + case 2: // dst + return 4; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } + + }; + + // Three source operands with two 16-bit values in a dword + class Inst_VOP3P__3OP_X16 : public Inst_VOP3P + { + public: + Inst_VOP3P__3OP_X16(InFmt_VOP3P *iFmt, const std::string& name) + : Inst_VOP3P(iFmt, name) + { + setFlag(ALU); + } + + int + getNumOperands() override + { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return 3; } + + int + getOperandSize(int opIdx) override + { + switch (opIdx) { + case 0: // src0 + return 4; + case 1: // src1 + return 4; + case 2: // src2 + return 4; + case 3: // dst + return 4; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } + + }; + + + + // Begin instruction implementations + class Inst_VOP3P__V_PK_MAD_I16 : public Inst_VOP3P__3OP_X16 + { + public: + Inst_VOP3P__V_PK_MAD_I16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__3OP_X16(iFmt, "v_pk_mad_i16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + 
}; + + class Inst_VOP3P__V_PK_MUL_LO_U16 : public Inst_VOP3P__2OP_X16 + { + public: + Inst_VOP3P__V_PK_MUL_LO_U16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__2OP_X16(iFmt, "v_pk_mul_lo_u16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_PK_ADD_I16 : public Inst_VOP3P__2OP_X16 + { + public: + Inst_VOP3P__V_PK_ADD_I16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__2OP_X16(iFmt, "v_pk_add_i16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_PK_SUB_I16 : public Inst_VOP3P__2OP_X16 + { + public: + Inst_VOP3P__V_PK_SUB_I16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__2OP_X16(iFmt, "v_pk_sub_i16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_PK_LSHLREV_B16 : public Inst_VOP3P__2OP_X16 + { + public: + Inst_VOP3P__V_PK_LSHLREV_B16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__2OP_X16(iFmt, "v_pk_lshlrev_b16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_PK_LSHRREV_B16 : public Inst_VOP3P__2OP_X16 + { + public: + Inst_VOP3P__V_PK_LSHRREV_B16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__2OP_X16(iFmt, "v_pk_lshrrev_b16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_PK_ASHRREV_B16 : public Inst_VOP3P__2OP_X16 + { + public: + Inst_VOP3P__V_PK_ASHRREV_B16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__2OP_X16(iFmt, "v_pk_ashrrev_b16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_PK_MAX_I16 : public Inst_VOP3P__2OP_X16 + { + public: + Inst_VOP3P__V_PK_MAX_I16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__2OP_X16(iFmt, "v_pk_max_i16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_PK_MIN_I16 : public Inst_VOP3P__2OP_X16 + { + public: + Inst_VOP3P__V_PK_MIN_I16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__2OP_X16(iFmt, "v_pk_min_i16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_PK_MAD_U16 : public Inst_VOP3P__3OP_X16 + { + public: + 
Inst_VOP3P__V_PK_MAD_U16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__3OP_X16(iFmt, "v_pk_mad_u16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_PK_ADD_U16 : public Inst_VOP3P__2OP_X16 + { + public: + Inst_VOP3P__V_PK_ADD_U16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__2OP_X16(iFmt, "v_pk_add_u16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_PK_SUB_U16 : public Inst_VOP3P__2OP_X16 + { + public: + Inst_VOP3P__V_PK_SUB_U16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__2OP_X16(iFmt, "v_pk_sub_u16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_PK_MAX_U16 : public Inst_VOP3P__2OP_X16 + { + public: + Inst_VOP3P__V_PK_MAX_U16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__2OP_X16(iFmt, "v_pk_max_u16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_PK_MIN_U16 : public Inst_VOP3P__2OP_X16 + { + public: + Inst_VOP3P__V_PK_MIN_U16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__2OP_X16(iFmt, "v_pk_min_u16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_PK_FMA_F16 : public Inst_VOP3P__3OP_X16 + { + public: + Inst_VOP3P__V_PK_FMA_F16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__3OP_X16(iFmt, "v_pk_fma_f16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_PK_ADD_F16 : public Inst_VOP3P__2OP_X16 + { + public: + Inst_VOP3P__V_PK_ADD_F16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__2OP_X16(iFmt, "v_pk_add_f16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_PK_MUL_F16 : public Inst_VOP3P__2OP_X16 + { + public: + Inst_VOP3P__V_PK_MUL_F16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__2OP_X16(iFmt, "v_pk_mul_f16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_PK_MIN_F16 : public Inst_VOP3P__2OP_X16 + { + public: + Inst_VOP3P__V_PK_MIN_F16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__2OP_X16(iFmt, "v_pk_min_f16") + { } + + void execute(GPUDynInstPtr 
gpuDynInst) override; + }; + + class Inst_VOP3P__V_PK_MAX_F16 : public Inst_VOP3P__2OP_X16 + { + public: + Inst_VOP3P__V_PK_MAX_F16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__2OP_X16(iFmt, "v_pk_max_f16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; +} // namespace VegaISA +} // namespace gem5 + +#endif // __ARCH_VEGA_INSTS_VOP3P_HH__