diff --git a/src/arch/amdgpu/vega/decoder.cc b/src/arch/amdgpu/vega/decoder.cc index 27474d5109..5e2402a4af 100644 --- a/src/arch/amdgpu/vega/decoder.cc +++ b/src/arch/amdgpu/vega/decoder.cc @@ -3614,15 +3614,15 @@ namespace VegaISA &Decoder::decode_OP_VOP3P__V_MAD_MIX_F32, &Decoder::decode_OP_VOP3P__V_MAD_MIXLO_F16, &Decoder::decode_OP_VOP3P__V_MAD_MIXHI_F16, + &Decoder::decode_OP_VOP3P__V_DOT2_F32_F16, &Decoder::decode_invalid, &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, + &Decoder::decode_OP_VOP3P__V_DOT2_I32_I16, + &Decoder::decode_OP_VOP3P__V_DOT2_U32_U16, + &Decoder::decode_OP_VOP3P__V_DOT4_I32_I8, + &Decoder::decode_OP_VOP3P__V_DOT4_U32_U8, + &Decoder::decode_OP_VOP3P__V_DOT8_I32_I4, + &Decoder::decode_OP_VOP3P__V_DOT8_U32_U4, &Decoder::decode_invalid, &Decoder::decode_invalid, &Decoder::decode_invalid, @@ -3667,8 +3667,8 @@ namespace VegaISA &Decoder::decode_OP_VOP3P__V_MFMA_I32_16X16X16I8, &Decoder::decode_invalid, &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, + &Decoder::decode_OP_VOP3P__V_ACCVGPR_READ, + &Decoder::decode_OP_VOP3P__V_ACCVGPR_WRITE, &Decoder::decode_invalid, &Decoder::decode_invalid, &Decoder::decode_invalid, @@ -13125,6 +13125,48 @@ namespace VegaISA return new Inst_VOP3P__V_PK_MOV_B32(&iFmt->iFmt_VOP3P); } + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_DOT2_F32_F16(MachInst iFmt) + { + return new Inst_VOP3P__V_DOT2_F32_F16(&iFmt->iFmt_VOP3P); + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_DOT2_I32_I16(MachInst iFmt) + { + return new Inst_VOP3P__V_DOT2_I32_I16(&iFmt->iFmt_VOP3P); + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_DOT2_U32_U16(MachInst iFmt) + { + return new Inst_VOP3P__V_DOT2_U32_U16(&iFmt->iFmt_VOP3P); + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_DOT4_I32_I8(MachInst iFmt) + { + return new 
Inst_VOP3P__V_DOT4_I32_I8(&iFmt->iFmt_VOP3P); + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_DOT4_U32_U8(MachInst iFmt) + { + return new Inst_VOP3P__V_DOT4_U32_U8(&iFmt->iFmt_VOP3P); + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_DOT8_I32_I4(MachInst iFmt) + { + return new Inst_VOP3P__V_DOT8_I32_I4(&iFmt->iFmt_VOP3P); + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_DOT8_U32_U4(MachInst iFmt) + { + return new Inst_VOP3P__V_DOT8_U32_U4(&iFmt->iFmt_VOP3P); + } + GPUStaticInst* Decoder::decode_OP_VOP3P__V_MFMA_I32_16X16X16I8(MachInst iFmt) { @@ -13132,6 +13174,18 @@ namespace VegaISA &iFmt->iFmt_VOP3P_MAI); } + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_ACCVGPR_READ(MachInst iFmt) + { + return new Inst_VOP3P__V_ACCVGPR_READ(&iFmt->iFmt_VOP3P); + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_ACCVGPR_WRITE(MachInst iFmt) + { + return new Inst_VOP3P__V_ACCVGPR_WRITE(&iFmt->iFmt_VOP3P); + } + GPUStaticInst* Decoder::decode_OP_VOP3P__V_MFMA_F64_16X16X4F64(MachInst iFmt) { diff --git a/src/arch/amdgpu/vega/gpu_decoder.hh b/src/arch/amdgpu/vega/gpu_decoder.hh index 337011cdb8..48084a6913 100644 --- a/src/arch/amdgpu/vega/gpu_decoder.hh +++ b/src/arch/amdgpu/vega/gpu_decoder.hh @@ -1597,7 +1597,16 @@ namespace VegaISA GPUStaticInst* decode_OP_VOP3P__V_PK_MUL_F32(MachInst); GPUStaticInst* decode_OP_VOP3P__V_PK_ADD_F32(MachInst); GPUStaticInst* decode_OP_VOP3P__V_PK_MOV_B32(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_DOT2_F32_F16(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_DOT2_I32_I16(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_DOT2_U32_U16(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_DOT4_I32_I8(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_DOT4_U32_U8(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_DOT8_I32_I4(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_DOT8_U32_U4(MachInst); GPUStaticInst* decode_OP_VOP3P__V_MFMA_I32_16X16X16I8(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_ACCVGPR_READ(MachInst); + GPUStaticInst* 
decode_OP_VOP3P__V_ACCVGPR_WRITE(MachInst); GPUStaticInst* decode_OP_VOP3P__V_MFMA_F64_16X16X4F64(MachInst); GPUStaticInst* subDecode_OPU_VOP3(MachInst); GPUStaticInst* subDecode_OP_DS(MachInst); diff --git a/src/arch/amdgpu/vega/insts/op_encodings.hh b/src/arch/amdgpu/vega/insts/op_encodings.hh index d980eb90bc..9ab7b84974 100644 --- a/src/arch/amdgpu/vega/insts/op_encodings.hh +++ b/src/arch/amdgpu/vega/insts/op_encodings.hh @@ -588,6 +588,56 @@ namespace VegaISA D.write(); } + void + dotHelper(GPUDynInstPtr gpuDynInst, + uint32_t (*fOpImpl)(uint32_t, uint32_t, uint32_t, bool)) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 S0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 S1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 S2(gpuDynInst, extData.SRC2); + VecOperandU32 D(gpuDynInst, instData.VDST); + + S0.readSrc(); + S1.readSrc(); + S2.readSrc(); + + // OPSEL[2] and OPSEL_HI2 are unused. Craft two dwords where: + // dword1[15:0] is upper/lower 16b of src0 based on opsel[0] + // dword1[31:16] is upper/lower 16b of src0 based on opsel_hi[0] + // dword2[15:0] is upper/lower 16b of src1 based on opsel[1] + // dword2[31:16] is upper/lower 16b of src1 based on opsel_hi[1] + int opLo = instData.OPSEL; + int opHi = extData.OPSEL_HI; + int negLo = extData.NEG; + int negHi = instData.NEG_HI; + bool clamp = instData.CLMP; + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + uint32_t dword1l = + word(S0[lane], opLo, negLo, 0); + uint32_t dword1h = + word(S0[lane], opHi, negHi, 0); + uint32_t dword2l = + word(S1[lane], opLo, negLo, 1); + uint32_t dword2h = + word(S1[lane], opHi, negHi, 1); + + uint32_t dword1 = (dword1h << 16) | dword1l; + uint32_t dword2 = (dword2h << 16) | dword2l; + + // Take in two uint32_t dwords and one src2 dword. The + // function will need to call bits to break up to the + // correct size and then reinterpret cast to the correct + // value.
+ D[lane] = fOpImpl(dword1, dword2, S2[lane], clamp); + } + } + + D.write(); + } + private: bool hasSecondDword(InFmt_VOP3P *); diff --git a/src/arch/amdgpu/vega/insts/vop3p.cc b/src/arch/amdgpu/vega/insts/vop3p.cc index a356d3bab3..eddb1e7ad5 100644 --- a/src/arch/amdgpu/vega/insts/vop3p.cc +++ b/src/arch/amdgpu/vega/insts/vop3p.cc @@ -42,6 +42,38 @@ namespace VegaISA using half = uint16_t; // Helper functions +template +int32_t +dotClampI(int32_t value, bool clamp) +{ + // Only valid for N < 32 + static_assert(N < 32); + + if (!clamp) { + return static_cast(value); + } + + int32_t min = -(1 << (N - 1)); + int32_t max = (1 << (N - 1)) - 1; + return std::clamp(value, min, max); +} + +template +uint32_t +dotClampU(uint32_t value, bool clamp) +{ + // Only valid for N < 32 + static_assert(N < 32); + + if (!clamp) { + return static_cast(value); + } + + uint32_t min = 0; + uint32_t max = (1 << N) - 1; + return std::clamp(value, min, max); +} + int16_t clampI16(int32_t value, bool clamp) { @@ -83,6 +115,16 @@ clampF16(uint16_t value, bool clamp) return fplibMax(imm, zero, fpscr2); } +float +clampF32(float value, bool clamp) +{ + if (!clamp) { + return value; + } + + return std::clamp(value, 0.0f, 1.0f); +} + @@ -298,5 +340,296 @@ void Inst_VOP3P__V_PK_MAX_F16::execute(GPUDynInstPtr gpuDynInst) vop3pHelper(gpuDynInst, opImpl); } +void Inst_VOP3P__V_DOT2_F32_F16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = + [](uint32_t S0r, uint32_t S1r, uint32_t S2r, bool clamp) -> uint32_t + { + constexpr unsigned INBITS = 16; + + constexpr unsigned elems = 32 / INBITS; + half S0[elems]; + half S1[elems]; + + for (int i = 0; i < elems; ++i) { + S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS); + S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS); + } + + float S2 = *reinterpret_cast(&S2r); + + // Compute components individually to prevent overflow across packing + half C[elems]; + float Csum = 0.0f; + + for (int i = 0; i < elems; ++i) { + ArmISA::FPSCR fpscr; + C[i] = fplibMul(S0[i], 
S1[i], fpscr); + uint32_t conv = + ArmISA::fplibConvert( + C[i], ArmISA::FPRounding_TIEEVEN, fpscr); + Csum += clampF32(*reinterpret_cast(&conv), clamp); + } + + Csum += S2; + uint32_t rv = *reinterpret_cast(&Csum); + + return rv; + }; + + dotHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_DOT2_I32_I16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = + [](uint32_t S0r, uint32_t S1r, uint32_t S2r, bool clamp) -> uint32_t + { + constexpr unsigned INBITS = 16; + + constexpr unsigned elems = 32 / INBITS; + uint32_t S0[elems]; + uint32_t S1[elems]; + + for (int i = 0; i < elems; ++i) { + S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS); + S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS); + } + + int32_t S2 = *reinterpret_cast(&S2r); + + // Compute components individually to prevent overflow across packing + int32_t C[elems]; + int32_t Csum = 0; + + for (int i = 0; i < elems; ++i) { + C[i] = sext(S0[i]) * sext(S1[i]); + C[i] = sext(dotClampI(C[i], clamp) & mask(INBITS)); + Csum += C[i]; + } + + Csum += S2; + uint32_t rv = *reinterpret_cast(&Csum); + + return rv; + }; + + dotHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_DOT2_U32_U16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = + [](uint32_t S0r, uint32_t S1r, uint32_t S2, bool clamp) -> uint32_t + { + constexpr unsigned INBITS = 16; + + constexpr unsigned elems = 32 / INBITS; + uint32_t S0[elems]; + uint32_t S1[elems]; + + for (int i = 0; i < elems; ++i) { + S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS); + S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS); + } + + // Compute components individually to prevent overflow across packing + uint32_t C[elems]; + uint32_t Csum = 0; + + for (int i = 0; i < elems; ++i) { + C[i] = S0[i] * S1[i]; + C[i] = dotClampU(C[i], clamp); + Csum += C[i]; + } + + Csum += S2; + + return Csum; + }; + + dotHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_DOT4_I32_I8::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = + [](uint32_t S0r, uint32_t S1r, uint32_t S2r, bool 
clamp) -> uint32_t + { + constexpr unsigned INBITS = 8; + + constexpr unsigned elems = 32 / INBITS; + uint32_t S0[elems]; + uint32_t S1[elems]; + + for (int i = 0; i < elems; ++i) { + S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS); + S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS); + } + + int32_t S2 = *reinterpret_cast(&S2r); + + // Compute components individually to prevent overflow across packing + int32_t C[elems]; + int32_t Csum = 0; + + for (int i = 0; i < elems; ++i) { + C[i] = sext(S0[i]) * sext(S1[i]); + C[i] = sext(dotClampI(C[i], clamp) & mask(INBITS)); + Csum += C[i]; + } + + Csum += S2; + uint32_t rv = *reinterpret_cast(&Csum); + + return rv; + }; + + dotHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_DOT4_U32_U8::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = + [](uint32_t S0r, uint32_t S1r, uint32_t S2, bool clamp) -> uint32_t + { + constexpr unsigned INBITS = 8; + + constexpr unsigned elems = 32 / INBITS; + uint32_t S0[elems]; + uint32_t S1[elems]; + + for (int i = 0; i < elems; ++i) { + S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS); + S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS); + } + + // Compute components individually to prevent overflow across packing + uint32_t C[elems]; + uint32_t Csum = 0; + + for (int i = 0; i < elems; ++i) { + C[i] = S0[i] * S1[i]; + C[i] = dotClampU(C[i], clamp); + Csum += C[i]; + } + + Csum += S2; + + return Csum; + }; + + dotHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_DOT8_I32_I4::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = + [](uint32_t S0r, uint32_t S1r, uint32_t S2r, bool clamp) -> uint32_t + { + constexpr unsigned INBITS = 4; + + constexpr unsigned elems = 32 / INBITS; + uint32_t S0[elems]; + uint32_t S1[elems]; + + for (int i = 0; i < elems; ++i) { + S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS); + S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS); + } + + int32_t S2 = *reinterpret_cast(&S2r); + + // Compute components individually to prevent overflow across packing + int32_t C[elems]; 
+ int32_t Csum = 0; + + for (int i = 0; i < elems; ++i) { + C[i] = sext(S0[i]) * sext(S1[i]); + C[i] = sext(dotClampI(C[i], clamp) & mask(INBITS)); + Csum += C[i]; + } + + Csum += S2; + uint32_t rv = *reinterpret_cast(&Csum); + + return rv; + }; + + dotHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_DOT8_U32_U4::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = + [](uint32_t S0r, uint32_t S1r, uint32_t S2, bool clamp) -> uint32_t + { + constexpr unsigned INBITS = 4; + + constexpr unsigned elems = 32 / INBITS; + uint32_t S0[elems]; + uint32_t S1[elems]; + + for (int i = 0; i < elems; ++i) { + S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS); + S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS); + } + + // Compute components individually to prevent overflow across packing + uint32_t C[elems]; + uint32_t Csum = 0; + + for (int i = 0; i < elems; ++i) { + C[i] = S0[i] * S1[i]; + C[i] = dotClampU(C[i], clamp); + Csum += C[i]; + } + + Csum += S2; + + return Csum; + }; + + dotHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_ACCVGPR_READ::execute(GPUDynInstPtr gpuDynInst) +{ + // The Acc register file is not supported in gem5 and has been removed + // in MI200. Therefore this instruction becomes a mov. + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, extData.SRC0); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src[lane]; + } + } + + vdst.write(); +} + +void Inst_VOP3P__V_ACCVGPR_WRITE::execute(GPUDynInstPtr gpuDynInst) +{ + // The Acc register file is not supported in gem5 and has been removed + // in MI200. Therefore this instruction becomes a mov. 
+ Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, extData.SRC0); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src[lane]; + } + } + + vdst.write(); +} + } // namespace VegaISA } // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/vop3p.hh b/src/arch/amdgpu/vega/insts/vop3p.hh index 56f0f80435..fbb81f12f7 100644 --- a/src/arch/amdgpu/vega/insts/vop3p.hh +++ b/src/arch/amdgpu/vega/insts/vop3p.hh @@ -42,6 +42,41 @@ namespace gem5 namespace VegaISA { + // One source operand + class Inst_VOP3P__1OP : public Inst_VOP3P + { + public: + Inst_VOP3P__1OP(InFmt_VOP3P *iFmt, const std::string& name) + : Inst_VOP3P(iFmt, name) + { + setFlag(ALU); + } + + int + getNumOperands() override + { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return 1; } + + int + getOperandSize(int opIdx) override + { + switch (opIdx) { + case 0: // src + return 4; + case 1: // dst + return 4; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } + + }; + // Two source operands with two 16-bit values in a dword class Inst_VOP3P__2OP_X16 : public Inst_VOP3P { @@ -310,6 +345,96 @@ namespace VegaISA void execute(GPUDynInstPtr gpuDynInst) override; }; + + class Inst_VOP3P__V_DOT2_F32_F16 : public Inst_VOP3P__3OP_X16 + { + public: + Inst_VOP3P__V_DOT2_F32_F16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__3OP_X16(iFmt, "v_dot2_f32_f16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_DOT2_I32_I16 : public Inst_VOP3P__3OP_X16 + { + public: + Inst_VOP3P__V_DOT2_I32_I16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__3OP_X16(iFmt, "v_dot2_i32_i16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_DOT2_U32_U16 : public Inst_VOP3P__3OP_X16 + { + public: + 
Inst_VOP3P__V_DOT2_U32_U16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__3OP_X16(iFmt, "v_dot2_u32_u16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_DOT4_I32_I8 : public Inst_VOP3P__3OP_X16 + { + public: + Inst_VOP3P__V_DOT4_I32_I8(InFmt_VOP3P *iFmt) + : Inst_VOP3P__3OP_X16(iFmt, "v_dot4_i32_i8") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_DOT4_U32_U8 : public Inst_VOP3P__3OP_X16 + { + public: + Inst_VOP3P__V_DOT4_U32_U8(InFmt_VOP3P *iFmt) + : Inst_VOP3P__3OP_X16(iFmt, "v_dot4_u32_u8") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_DOT8_I32_I4 : public Inst_VOP3P__3OP_X16 + { + public: + Inst_VOP3P__V_DOT8_I32_I4(InFmt_VOP3P *iFmt) + : Inst_VOP3P__3OP_X16(iFmt, "v_dot8_i32_i4") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_DOT8_U32_U4 : public Inst_VOP3P__3OP_X16 + { + public: + Inst_VOP3P__V_DOT8_U32_U4(InFmt_VOP3P *iFmt) + : Inst_VOP3P__3OP_X16(iFmt, "v_dot8_u32_u4") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_ACCVGPR_READ : public Inst_VOP3P__1OP + { + public: + Inst_VOP3P__V_ACCVGPR_READ(InFmt_VOP3P *iFmt) + : Inst_VOP3P__1OP(iFmt, "v_accvgpr_read") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_ACCVGPR_WRITE : public Inst_VOP3P__1OP + { + public: + Inst_VOP3P__V_ACCVGPR_WRITE(InFmt_VOP3P *iFmt) + : Inst_VOP3P__1OP(iFmt, "v_accvgpr_write") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; } // namespace VegaISA } // namespace gem5