From 31e63b01ad19ac3f6a00eaaa51507bc356298ef5 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Tue, 12 Dec 2023 00:59:33 -0600 Subject: [PATCH] arch-vega: Add vop3p DOT instructions Implemented according to the ISA spec. Validated with silicon. In particular the sign extend is important for the signed variants and the unsigned variants seem to overflow lanes (hence why there is no mask() in the unsigned variants). FP16 -> FP32 continues using ARM's fplib. Tested vs. an MI210. Clamp has not been verified. Change-Id: Ifc09aecbc1ef2c92a5524a43ca529983018a6d59 --- src/arch/amdgpu/vega/decoder.cc | 72 ++++- src/arch/amdgpu/vega/gpu_decoder.hh | 9 + src/arch/amdgpu/vega/insts/op_encodings.hh | 50 ++++ src/arch/amdgpu/vega/insts/vop3p.cc | 333 +++++++++++++++++++++ src/arch/amdgpu/vega/insts/vop3p.hh | 125 ++++++++ 5 files changed, 580 insertions(+), 9 deletions(-) diff --git a/src/arch/amdgpu/vega/decoder.cc b/src/arch/amdgpu/vega/decoder.cc index 27474d5109..5e2402a4af 100644 --- a/src/arch/amdgpu/vega/decoder.cc +++ b/src/arch/amdgpu/vega/decoder.cc @@ -3614,15 +3614,15 @@ namespace VegaISA &Decoder::decode_OP_VOP3P__V_MAD_MIX_F32, &Decoder::decode_OP_VOP3P__V_MAD_MIXLO_F16, &Decoder::decode_OP_VOP3P__V_MAD_MIXHI_F16, + &Decoder::decode_OP_VOP3P__V_DOT2_F32_F16, &Decoder::decode_invalid, &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, + &Decoder::decode_OP_VOP3P__V_DOT2_I32_I16, + &Decoder::decode_OP_VOP3P__V_DOT2_U32_U16, + &Decoder::decode_OP_VOP3P__V_DOT4_I32_I8, + &Decoder::decode_OP_VOP3P__V_DOT4_U32_U8, + &Decoder::decode_OP_VOP3P__V_DOT8_I32_I4, + &Decoder::decode_OP_VOP3P__V_DOT8_U32_U4, &Decoder::decode_invalid, &Decoder::decode_invalid, &Decoder::decode_invalid, @@ -3667,8 +3667,8 @@ namespace VegaISA &Decoder::decode_OP_VOP3P__V_MFMA_I32_16X16X16I8, &Decoder::decode_invalid, 
&Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, + &Decoder::decode_OP_VOP3P__V_ACCVGPR_READ, + &Decoder::decode_OP_VOP3P__V_ACCVGPR_WRITE, &Decoder::decode_invalid, &Decoder::decode_invalid, &Decoder::decode_invalid, @@ -13125,6 +13125,48 @@ namespace VegaISA return new Inst_VOP3P__V_PK_MOV_B32(&iFmt->iFmt_VOP3P); } + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_DOT2_F32_F16(MachInst iFmt) + { + return new Inst_VOP3P__V_DOT2_F32_F16(&iFmt->iFmt_VOP3P); + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_DOT2_I32_I16(MachInst iFmt) + { + return new Inst_VOP3P__V_DOT2_I32_I16(&iFmt->iFmt_VOP3P); + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_DOT2_U32_U16(MachInst iFmt) + { + return new Inst_VOP3P__V_DOT2_U32_U16(&iFmt->iFmt_VOP3P); + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_DOT4_I32_I8(MachInst iFmt) + { + return new Inst_VOP3P__V_DOT4_I32_I8(&iFmt->iFmt_VOP3P); + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_DOT4_U32_U8(MachInst iFmt) + { + return new Inst_VOP3P__V_DOT4_U32_U8(&iFmt->iFmt_VOP3P); + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_DOT8_I32_I4(MachInst iFmt) + { + return new Inst_VOP3P__V_DOT8_I32_I4(&iFmt->iFmt_VOP3P); + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_DOT8_U32_U4(MachInst iFmt) + { + return new Inst_VOP3P__V_DOT8_U32_U4(&iFmt->iFmt_VOP3P); + } + GPUStaticInst* Decoder::decode_OP_VOP3P__V_MFMA_I32_16X16X16I8(MachInst iFmt) { @@ -13132,6 +13174,18 @@ namespace VegaISA &iFmt->iFmt_VOP3P_MAI); } + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_ACCVGPR_READ(MachInst iFmt) + { + return new Inst_VOP3P__V_ACCVGPR_READ(&iFmt->iFmt_VOP3P); + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_ACCVGPR_WRITE(MachInst iFmt) + { + return new Inst_VOP3P__V_ACCVGPR_WRITE(&iFmt->iFmt_VOP3P); + } + GPUStaticInst* Decoder::decode_OP_VOP3P__V_MFMA_F64_16X16X4F64(MachInst iFmt) { diff --git a/src/arch/amdgpu/vega/gpu_decoder.hh b/src/arch/amdgpu/vega/gpu_decoder.hh index 337011cdb8..48084a6913 
100644 --- a/src/arch/amdgpu/vega/gpu_decoder.hh +++ b/src/arch/amdgpu/vega/gpu_decoder.hh @@ -1597,7 +1597,16 @@ namespace VegaISA GPUStaticInst* decode_OP_VOP3P__V_PK_MUL_F32(MachInst); GPUStaticInst* decode_OP_VOP3P__V_PK_ADD_F32(MachInst); GPUStaticInst* decode_OP_VOP3P__V_PK_MOV_B32(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_DOT2_F32_F16(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_DOT2_I32_I16(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_DOT2_U32_U16(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_DOT4_I32_I8(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_DOT4_U32_U8(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_DOT8_I32_I4(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_DOT8_U32_U4(MachInst); GPUStaticInst* decode_OP_VOP3P__V_MFMA_I32_16X16X16I8(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_ACCVGPR_READ(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_ACCVGPR_WRITE(MachInst); GPUStaticInst* decode_OP_VOP3P__V_MFMA_F64_16X16X4F64(MachInst); GPUStaticInst* subDecode_OPU_VOP3(MachInst); GPUStaticInst* subDecode_OP_DS(MachInst); diff --git a/src/arch/amdgpu/vega/insts/op_encodings.hh b/src/arch/amdgpu/vega/insts/op_encodings.hh index d980eb90bc..9ab7b84974 100644 --- a/src/arch/amdgpu/vega/insts/op_encodings.hh +++ b/src/arch/amdgpu/vega/insts/op_encodings.hh @@ -588,6 +588,56 @@ namespace VegaISA D.write(); } + void + dotHelper(GPUDynInstPtr gpuDynInst, + uint32_t (*fOpImpl)(uint32_t, uint32_t, uint32_t, bool)) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 S0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 S1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 S2(gpuDynInst, extData.SRC2); + VecOperandU32 D(gpuDynInst, instData.VDST); + + S0.readSrc(); + S1.readSrc(); + S2.readSrc(); + + // OPSEL[2] and OPSEL_HI2 are unused. 
Craft two dwords where: + // dword1[15:0] is upper/lower 16b of src0 based on opsel[0] + // dword1[31:16] is upper/lower 16b of src0 based on opsel_hi[0] + // dword2[15:0] is upper/lower 16b of src1 based on opsel[1] + // dword2[31:16] is upper/lower 16b of src1 based on opsel_hi[1] + int opLo = instData.OPSEL; + int opHi = extData.OPSEL_HI; + int negLo = extData.NEG; + int negHi = instData.NEG_HI; + bool clamp = instData.CLMP; + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + uint32_t dword1l = + word(S0[lane], opLo, negLo, 0); + uint32_t dword1h = + word(S0[lane], opHi, negHi, 0); + uint32_t dword2l = + word(S1[lane], opLo, negLo, 1); + uint32_t dword2h = + word(S1[lane], opHi, negHi, 1); + + uint32_t dword1 = (dword1h << 16) | dword1l; + uint32_t dword2 = (dword2h << 16) | dword2l; + + // Take in two uint32_t dwords and one src2 dword. The + // function will need to call bits to break up to the + // correct size and then reinterpret cast to the correct + // value. 
+ D[lane] = fOpImpl(dword1, dword2, S2[lane], clamp); + } + } + + D.write(); + } + private: bool hasSecondDword(InFmt_VOP3P *); diff --git a/src/arch/amdgpu/vega/insts/vop3p.cc b/src/arch/amdgpu/vega/insts/vop3p.cc index a356d3bab3..eddb1e7ad5 100644 --- a/src/arch/amdgpu/vega/insts/vop3p.cc +++ b/src/arch/amdgpu/vega/insts/vop3p.cc @@ -42,6 +42,38 @@ namespace VegaISA using half = uint16_t; // Helper functions +template +int32_t +dotClampI(int32_t value, bool clamp) +{ + // Only valid for N < 32 + static_assert(N < 32); + + if (!clamp) { + return static_cast(value); + } + + int32_t min = -(1 << (N - 1)); + int32_t max = (1 << (N - 1)) - 1; + return std::clamp(value, min, max); +} + +template +uint32_t +dotClampU(uint32_t value, bool clamp) +{ + // Only valid for N < 32 + static_assert(N < 32); + + if (!clamp) { + return static_cast(value); + } + + uint32_t min = 0; + uint32_t max = (1 << N) - 1; + return std::clamp(value, min, max); +} + int16_t clampI16(int32_t value, bool clamp) { @@ -83,6 +115,16 @@ clampF16(uint16_t value, bool clamp) return fplibMax(imm, zero, fpscr2); } +float +clampF32(float value, bool clamp) +{ + if (!clamp) { + return value; + } + + return std::clamp(value, 0.0f, 1.0f); +} + @@ -298,5 +340,296 @@ void Inst_VOP3P__V_PK_MAX_F16::execute(GPUDynInstPtr gpuDynInst) vop3pHelper(gpuDynInst, opImpl); } +void Inst_VOP3P__V_DOT2_F32_F16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = + [](uint32_t S0r, uint32_t S1r, uint32_t S2r, bool clamp) -> uint32_t + { + constexpr unsigned INBITS = 16; + + constexpr unsigned elems = 32 / INBITS; + half S0[elems]; + half S1[elems]; + + for (int i = 0; i < elems; ++i) { + S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS); + S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS); + } + + float S2 = *reinterpret_cast(&S2r); + + // Compute components individually to prevent overflow across packing + half C[elems]; + float Csum = 0.0f; + + for (int i = 0; i < elems; ++i) { + ArmISA::FPSCR fpscr; + C[i] = fplibMul(S0[i], 
S1[i], fpscr); + uint32_t conv = + ArmISA::fplibConvert( + C[i], ArmISA::FPRounding_TIEEVEN, fpscr); + Csum += clampF32(*reinterpret_cast(&conv), clamp); + } + + Csum += S2; + uint32_t rv = *reinterpret_cast(&Csum); + + return rv; + }; + + dotHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_DOT2_I32_I16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = + [](uint32_t S0r, uint32_t S1r, uint32_t S2r, bool clamp) -> uint32_t + { + constexpr unsigned INBITS = 16; + + constexpr unsigned elems = 32 / INBITS; + uint32_t S0[elems]; + uint32_t S1[elems]; + + for (int i = 0; i < elems; ++i) { + S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS); + S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS); + } + + int32_t S2 = *reinterpret_cast(&S2r); + + // Compute components individually to prevent overflow across packing + int32_t C[elems]; + int32_t Csum = 0; + + for (int i = 0; i < elems; ++i) { + C[i] = sext(S0[i]) * sext(S1[i]); + C[i] = sext(dotClampI(C[i], clamp) & mask(INBITS)); + Csum += C[i]; + } + + Csum += S2; + uint32_t rv = *reinterpret_cast(&Csum); + + return rv; + }; + + dotHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_DOT2_U32_U16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = + [](uint32_t S0r, uint32_t S1r, uint32_t S2, bool clamp) -> uint32_t + { + constexpr unsigned INBITS = 16; + + constexpr unsigned elems = 32 / INBITS; + uint32_t S0[elems]; + uint32_t S1[elems]; + + for (int i = 0; i < elems; ++i) { + S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS); + S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS); + } + + // Compute components individually to prevent overflow across packing + uint32_t C[elems]; + uint32_t Csum = 0; + + for (int i = 0; i < elems; ++i) { + C[i] = S0[i] * S1[i]; + C[i] = dotClampU(C[i], clamp); + Csum += C[i]; + } + + Csum += S2; + + return Csum; + }; + + dotHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_DOT4_I32_I8::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = + [](uint32_t S0r, uint32_t S1r, uint32_t S2r, bool 
clamp) -> uint32_t + { + constexpr unsigned INBITS = 8; + + constexpr unsigned elems = 32 / INBITS; + uint32_t S0[elems]; + uint32_t S1[elems]; + + for (int i = 0; i < elems; ++i) { + S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS); + S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS); + } + + int32_t S2 = *reinterpret_cast(&S2r); + + // Compute components individually to prevent overflow across packing + int32_t C[elems]; + int32_t Csum = 0; + + for (int i = 0; i < elems; ++i) { + C[i] = sext(S0[i]) * sext(S1[i]); + C[i] = sext(dotClampI(C[i], clamp) & mask(INBITS)); + Csum += C[i]; + } + + Csum += S2; + uint32_t rv = *reinterpret_cast(&Csum); + + return rv; + }; + + dotHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_DOT4_U32_U8::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = + [](uint32_t S0r, uint32_t S1r, uint32_t S2, bool clamp) -> uint32_t + { + constexpr unsigned INBITS = 8; + + constexpr unsigned elems = 32 / INBITS; + uint32_t S0[elems]; + uint32_t S1[elems]; + + for (int i = 0; i < elems; ++i) { + S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS); + S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS); + } + + // Compute components individually to prevent overflow across packing + uint32_t C[elems]; + uint32_t Csum = 0; + + for (int i = 0; i < elems; ++i) { + C[i] = S0[i] * S1[i]; + C[i] = dotClampU(C[i], clamp); + Csum += C[i]; + } + + Csum += S2; + + return Csum; + }; + + dotHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_DOT8_I32_I4::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = + [](uint32_t S0r, uint32_t S1r, uint32_t S2r, bool clamp) -> uint32_t + { + constexpr unsigned INBITS = 4; + + constexpr unsigned elems = 32 / INBITS; + uint32_t S0[elems]; + uint32_t S1[elems]; + + for (int i = 0; i < elems; ++i) { + S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS); + S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS); + } + + int32_t S2 = *reinterpret_cast(&S2r); + + // Compute components individually to prevent overflow across packing + int32_t C[elems]; 
+ int32_t Csum = 0; + + for (int i = 0; i < elems; ++i) { + C[i] = sext(S0[i]) * sext(S1[i]); + C[i] = sext(dotClampI(C[i], clamp) & mask(INBITS)); + Csum += C[i]; + } + + Csum += S2; + uint32_t rv = *reinterpret_cast(&Csum); + + return rv; + }; + + dotHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_DOT8_U32_U4::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = + [](uint32_t S0r, uint32_t S1r, uint32_t S2, bool clamp) -> uint32_t + { + constexpr unsigned INBITS = 4; + + constexpr unsigned elems = 32 / INBITS; + uint32_t S0[elems]; + uint32_t S1[elems]; + + for (int i = 0; i < elems; ++i) { + S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS); + S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS); + } + + // Compute components individually to prevent overflow across packing + uint32_t C[elems]; + uint32_t Csum = 0; + + for (int i = 0; i < elems; ++i) { + C[i] = S0[i] * S1[i]; + C[i] = dotClampU(C[i], clamp); + Csum += C[i]; + } + + Csum += S2; + + return Csum; + }; + + dotHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_ACCVGPR_READ::execute(GPUDynInstPtr gpuDynInst) +{ + // The Acc register file is not supported in gem5 and has been removed + // in MI200. Therefore this instruction becomes a mov. + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, extData.SRC0); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src[lane]; + } + } + + vdst.write(); +} + +void Inst_VOP3P__V_ACCVGPR_WRITE::execute(GPUDynInstPtr gpuDynInst) +{ + // The Acc register file is not supported in gem5 and has been removed + // in MI200. Therefore this instruction becomes a mov. 
+ Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, extData.SRC0); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src[lane]; + } + } + + vdst.write(); +} + } // namespace VegaISA } // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/vop3p.hh b/src/arch/amdgpu/vega/insts/vop3p.hh index 56f0f80435..fbb81f12f7 100644 --- a/src/arch/amdgpu/vega/insts/vop3p.hh +++ b/src/arch/amdgpu/vega/insts/vop3p.hh @@ -42,6 +42,41 @@ namespace gem5 namespace VegaISA { + // One source operand + class Inst_VOP3P__1OP : public Inst_VOP3P + { + public: + Inst_VOP3P__1OP(InFmt_VOP3P *iFmt, const std::string& name) + : Inst_VOP3P(iFmt, name) + { + setFlag(ALU); + } + + int + getNumOperands() override + { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return 1; } + + int + getOperandSize(int opIdx) override + { + switch (opIdx) { + case 0: // src + return 4; + case 1: // dst + return 4; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } + + }; + // Two source operands with two 16-bit values in a dword class Inst_VOP3P__2OP_X16 : public Inst_VOP3P { @@ -310,6 +345,96 @@ namespace VegaISA void execute(GPUDynInstPtr gpuDynInst) override; }; + + class Inst_VOP3P__V_DOT2_F32_F16 : public Inst_VOP3P__3OP_X16 + { + public: + Inst_VOP3P__V_DOT2_F32_F16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__3OP_X16(iFmt, "v_dot2_f32_f16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_DOT2_I32_I16 : public Inst_VOP3P__3OP_X16 + { + public: + Inst_VOP3P__V_DOT2_I32_I16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__3OP_X16(iFmt, "v_dot2_i32_i16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_DOT2_U32_U16 : public Inst_VOP3P__3OP_X16 + { + public: + 
Inst_VOP3P__V_DOT2_U32_U16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__3OP_X16(iFmt, "v_dot2_u32_u16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_DOT4_I32_I8 : public Inst_VOP3P__3OP_X16 + { + public: + Inst_VOP3P__V_DOT4_I32_I8(InFmt_VOP3P *iFmt) + : Inst_VOP3P__3OP_X16(iFmt, "v_dot4_i32_i8") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_DOT4_U32_U8 : public Inst_VOP3P__3OP_X16 + { + public: + Inst_VOP3P__V_DOT4_U32_U8(InFmt_VOP3P *iFmt) + : Inst_VOP3P__3OP_X16(iFmt, "v_dot4_u32_u8") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_DOT8_I32_I4 : public Inst_VOP3P__3OP_X16 + { + public: + Inst_VOP3P__V_DOT8_I32_I4(InFmt_VOP3P *iFmt) + : Inst_VOP3P__3OP_X16(iFmt, "v_dot8_i32_i4") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_DOT8_U32_U4 : public Inst_VOP3P__3OP_X16 + { + public: + Inst_VOP3P__V_DOT8_U32_U4(InFmt_VOP3P *iFmt) + : Inst_VOP3P__3OP_X16(iFmt, "v_dot8_u32_u4") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_ACCVGPR_READ : public Inst_VOP3P__1OP + { + public: + Inst_VOP3P__V_ACCVGPR_READ(InFmt_VOP3P *iFmt) + : Inst_VOP3P__1OP(iFmt, "v_accvgpr_read") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_ACCVGPR_WRITE : public Inst_VOP3P__1OP + { + public: + Inst_VOP3P__V_ACCVGPR_WRITE(InFmt_VOP3P *iFmt) + : Inst_VOP3P__1OP(iFmt, "v_accvgpr_write") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; } // namespace VegaISA } // namespace gem5