arch-vega: Add vop3p DOT instructions

Implemented according to the ISA spec. Validated against silicon. In
particular, the sign extend is important for the signed variants, and the
unsigned variants seem to overflow across lanes (hence why there is no
mask() in the unsigned variants). FP16 -> FP32 continues using ARM's fplib.

Tested vs. an MI210. Clamp has not been verified.

Change-Id: Ifc09aecbc1ef2c92a5524a43ca529983018a6d59
This commit is contained in:
Matthew Poremba
2023-12-12 00:59:33 -06:00
parent a40f8f0efa
commit 31e63b01ad
5 changed files with 580 additions and 9 deletions

View File

@@ -3614,15 +3614,15 @@ namespace VegaISA
&Decoder::decode_OP_VOP3P__V_MAD_MIX_F32,
&Decoder::decode_OP_VOP3P__V_MAD_MIXLO_F16,
&Decoder::decode_OP_VOP3P__V_MAD_MIXHI_F16,
&Decoder::decode_OP_VOP3P__V_DOT2_F32_F16,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
&Decoder::decode_OP_VOP3P__V_DOT2_I32_I16,
&Decoder::decode_OP_VOP3P__V_DOT2_U32_U16,
&Decoder::decode_OP_VOP3P__V_DOT4_I32_I8,
&Decoder::decode_OP_VOP3P__V_DOT4_U32_U8,
&Decoder::decode_OP_VOP3P__V_DOT8_I32_I4,
&Decoder::decode_OP_VOP3P__V_DOT8_U32_U4,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
@@ -3667,8 +3667,8 @@ namespace VegaISA
&Decoder::decode_OP_VOP3P__V_MFMA_I32_16X16X16I8,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
&Decoder::decode_OP_VOP3P__V_ACCVGPR_READ,
&Decoder::decode_OP_VOP3P__V_ACCVGPR_WRITE,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
@@ -13125,6 +13125,48 @@ namespace VegaISA
return new Inst_VOP3P__V_PK_MOV_B32(&iFmt->iFmt_VOP3P);
}
// Decode entries for the VOP3P dot product instructions. Each simply
// constructs the corresponding static instruction from the VOP3P encoding.
GPUStaticInst*
Decoder::decode_OP_VOP3P__V_DOT2_F32_F16(MachInst iFmt)
{
return new Inst_VOP3P__V_DOT2_F32_F16(&iFmt->iFmt_VOP3P);
}
GPUStaticInst*
Decoder::decode_OP_VOP3P__V_DOT2_I32_I16(MachInst iFmt)
{
return new Inst_VOP3P__V_DOT2_I32_I16(&iFmt->iFmt_VOP3P);
}
GPUStaticInst*
Decoder::decode_OP_VOP3P__V_DOT2_U32_U16(MachInst iFmt)
{
return new Inst_VOP3P__V_DOT2_U32_U16(&iFmt->iFmt_VOP3P);
}
GPUStaticInst*
Decoder::decode_OP_VOP3P__V_DOT4_I32_I8(MachInst iFmt)
{
return new Inst_VOP3P__V_DOT4_I32_I8(&iFmt->iFmt_VOP3P);
}
GPUStaticInst*
Decoder::decode_OP_VOP3P__V_DOT4_U32_U8(MachInst iFmt)
{
return new Inst_VOP3P__V_DOT4_U32_U8(&iFmt->iFmt_VOP3P);
}
GPUStaticInst*
Decoder::decode_OP_VOP3P__V_DOT8_I32_I4(MachInst iFmt)
{
return new Inst_VOP3P__V_DOT8_I32_I4(&iFmt->iFmt_VOP3P);
}
GPUStaticInst*
Decoder::decode_OP_VOP3P__V_DOT8_U32_U4(MachInst iFmt)
{
return new Inst_VOP3P__V_DOT8_U32_U4(&iFmt->iFmt_VOP3P);
}
GPUStaticInst*
Decoder::decode_OP_VOP3P__V_MFMA_I32_16X16X16I8(MachInst iFmt)
{
@@ -13132,6 +13174,18 @@ namespace VegaISA
&iFmt->iFmt_VOP3P_MAI);
}
// Decode entries for the AccVGPR read/write instructions. These use the
// plain VOP3P format (not VOP3P_MAI) for construction.
GPUStaticInst*
Decoder::decode_OP_VOP3P__V_ACCVGPR_READ(MachInst iFmt)
{
return new Inst_VOP3P__V_ACCVGPR_READ(&iFmt->iFmt_VOP3P);
}
GPUStaticInst*
Decoder::decode_OP_VOP3P__V_ACCVGPR_WRITE(MachInst iFmt)
{
return new Inst_VOP3P__V_ACCVGPR_WRITE(&iFmt->iFmt_VOP3P);
}
GPUStaticInst*
Decoder::decode_OP_VOP3P__V_MFMA_F64_16X16X4F64(MachInst iFmt)
{

View File

@@ -1597,7 +1597,16 @@ namespace VegaISA
GPUStaticInst* decode_OP_VOP3P__V_PK_MUL_F32(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_PK_ADD_F32(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_PK_MOV_B32(MachInst);
// VOP3P dot product decoders (see the decoder table for opcode slots).
GPUStaticInst* decode_OP_VOP3P__V_DOT2_F32_F16(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_DOT2_I32_I16(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_DOT2_U32_U16(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_DOT4_I32_I8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_DOT4_U32_U8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_DOT8_I32_I4(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_DOT8_U32_U4(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_I32_16X16X16I8(MachInst);
// AccVGPR moves; gem5 models these as plain VGPR moves (no Acc file).
GPUStaticInst* decode_OP_VOP3P__V_ACCVGPR_READ(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_ACCVGPR_WRITE(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_F64_16X16X4F64(MachInst);
GPUStaticInst* subDecode_OPU_VOP3(MachInst);
GPUStaticInst* subDecode_OP_DS(MachInst);

View File

@@ -588,6 +588,56 @@ namespace VegaISA
D.write();
}
// Common lane loop for the VOP3P dot product instructions. Reads the three
// packed source operands, applies the OPSEL/OPSEL_HI half-word selection and
// the NEG modifiers, and hands the resulting dwords to the per-instruction
// arithmetic callback fOpImpl, whose result is written to VDST.
// NOTE(review): word<>() presumably selects the high/low 16-bit half and
// applies the negate bit — its definition is not visible here; confirm.
void
dotHelper(GPUDynInstPtr gpuDynInst,
uint32_t (*fOpImpl)(uint32_t, uint32_t, uint32_t, bool))
{
Wavefront *wf = gpuDynInst->wavefront();
ConstVecOperandU32 S0(gpuDynInst, extData.SRC0);
ConstVecOperandU32 S1(gpuDynInst, extData.SRC1);
ConstVecOperandU32 S2(gpuDynInst, extData.SRC2);
VecOperandU32 D(gpuDynInst, instData.VDST);
S0.readSrc();
S1.readSrc();
S2.readSrc();
// OPSEL[2] and OPSEL_HI2 are unused. Craft two dwords where:
// dword1[15:0] is upper/lower 16b of src0 based on opsel[0]
// dword1[31:16] is upper/lower 16b of src0 based on opsel_hi[0]
// dword2[15:0] is upper/lower 16b of src1 based on opsel[1]
// dword2[31:16] is upper/lower 16b of src1 based on opsel_hi[1]
int opLo = instData.OPSEL;
int opHi = extData.OPSEL_HI;
int negLo = extData.NEG;
int negHi = instData.NEG_HI;
bool clamp = instData.CLMP;
// Only lanes active in the exec mask are computed and written back.
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
if (wf->execMask(lane)) {
uint32_t dword1l =
word<uint16_t>(S0[lane], opLo, negLo, 0);
uint32_t dword1h =
word<uint16_t>(S0[lane], opHi, negHi, 0);
uint32_t dword2l =
word<uint16_t>(S1[lane], opLo, negLo, 1);
uint32_t dword2h =
word<uint16_t>(S1[lane], opHi, negHi, 1);
uint32_t dword1 = (dword1h << 16) | dword1l;
uint32_t dword2 = (dword2h << 16) | dword2l;
// Take in two uint32_t dwords and one src2 dword. The
// function will need to call bits to break up to the
// correct size and then reinterpret cast to the correct
// value.
D[lane] = fOpImpl(dword1, dword2, S2[lane], clamp);
}
}
D.write();
}
private:
bool hasSecondDword(InFmt_VOP3P *);

View File

@@ -42,6 +42,38 @@ namespace VegaISA
using half = uint16_t;
// Helper functions
template<int N>
int32_t
dotClampI(int32_t value, bool clamp)
{
    // Saturate a signed dot-product term to the range of an N-bit
    // two's-complement integer when the instruction's clamp bit is set;
    // otherwise pass the value through unchanged.
    static_assert(N < 32);  // shifting by N-1 below requires N < 32
    if (clamp) {
        constexpr int32_t lo = -(1 << (N - 1));
        constexpr int32_t hi = (1 << (N - 1)) - 1;
        return std::clamp<int32_t>(value, lo, hi);
    }
    return value;
}
template<int N>
uint32_t
dotClampU(uint32_t value, bool clamp)
{
    // Saturate an unsigned dot-product term to [0, 2^N - 1] when the
    // instruction's clamp bit is set; otherwise pass through unchanged.
    // Only valid for N < 32.
    static_assert(N < 32);
    if (!clamp) {
        // Return the value as-is; the previous int32_t cast here was a
        // misleading (though value-preserving) sign reinterpretation.
        return value;
    }
    uint32_t min = 0;
    // Use an unsigned literal so the shift never touches the sign bit.
    uint32_t max = (1u << N) - 1;
    // Clamp in the unsigned domain. The previous std::clamp<int32_t>
    // misinterpreted values above INT32_MAX as negative and incorrectly
    // clamped them to 0 instead of max.
    return std::clamp<uint32_t>(value, min, max);
}
int16_t
clampI16(int32_t value, bool clamp)
{
@@ -83,6 +115,16 @@ clampF16(uint16_t value, bool clamp)
return fplibMax(imm, zero, fpscr2);
}
float
clampF32(float value, bool clamp)
{
    // Saturate a single-precision result to [0.0, 1.0] when the
    // instruction's clamp bit is set; otherwise return it untouched.
    return clamp ? std::clamp(value, 0.0f, 1.0f) : value;
}
@@ -298,5 +340,296 @@ void Inst_VOP3P__V_PK_MAX_F16::execute(GPUDynInstPtr gpuDynInst)
vop3pHelper<half>(gpuDynInst, opImpl);
}
// D.f32 = S2.f32 + S0.f16[0]*S1.f16[0] + S0.f16[1]*S1.f16[1]
// FP16 arithmetic is done with ARM's fplib soft-float routines; each FP16
// product is widened to FP32 and accumulated in single precision.
void Inst_VOP3P__V_DOT2_F32_F16::execute(GPUDynInstPtr gpuDynInst)
{
auto opImpl =
[](uint32_t S0r, uint32_t S1r, uint32_t S2r, bool clamp) -> uint32_t
{
constexpr unsigned INBITS = 16;
constexpr unsigned elems = 32 / INBITS;
// 'half' is an alias for uint16_t holding raw FP16 bit patterns.
half S0[elems];
half S1[elems];
for (int i = 0; i < elems; ++i) {
S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS);
S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS);
}
// Reinterpret the raw accumulator dword as an IEEE-754 float.
float S2 = *reinterpret_cast<float*>(&S2r);
// Compute components individually to prevent overflow across packing
half C[elems];
float Csum = 0.0f;
for (int i = 0; i < elems; ++i) {
ArmISA::FPSCR fpscr;
// Multiply in FP16, then widen the product to FP32.
C[i] = fplibMul(S0[i], S1[i], fpscr);
uint32_t conv =
ArmISA::fplibConvert<uint16_t, uint32_t>(
C[i], ArmISA::FPRounding_TIEEVEN, fpscr);
// Each widened term is clamped before accumulation. NOTE: per the
// commit message, clamp behavior has not been verified vs hardware.
Csum += clampF32(*reinterpret_cast<float*>(&conv), clamp);
}
// S2 is added after the (optionally clamped) products.
Csum += S2;
uint32_t rv = *reinterpret_cast<uint32_t*>(&Csum);
return rv;
};
dotHelper(gpuDynInst, opImpl);
}
// D.i32 = S2.i32 + sext(S0.i16[0])*sext(S1.i16[0])
//                + sext(S0.i16[1])*sext(S1.i16[1])
void Inst_VOP3P__V_DOT2_I32_I16::execute(GPUDynInstPtr gpuDynInst)
{
auto opImpl =
[](uint32_t S0r, uint32_t S1r, uint32_t S2r, bool clamp) -> uint32_t
{
constexpr unsigned INBITS = 16;
constexpr unsigned elems = 32 / INBITS;
uint32_t S0[elems];
uint32_t S1[elems];
// Unpack the packed 16-bit elements of each source dword.
for (int i = 0; i < elems; ++i) {
S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS);
S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS);
}
int32_t S2 = *reinterpret_cast<int32_t*>(&S2r);
// Compute components individually to prevent overflow across packing
int32_t C[elems];
int32_t Csum = 0;
for (int i = 0; i < elems; ++i) {
// Sign extension of each element is required for correct results.
C[i] = sext<INBITS>(S0[i]) * sext<INBITS>(S1[i]);
// Optionally clamp, then wrap each product back to INBITS bits and
// re-sign-extend. Per the commit message this per-element masking of
// signed products matches hardware (validated against an MI210);
// clamp itself has not been verified.
C[i] = sext<INBITS>(dotClampI<INBITS>(C[i], clamp) & mask(INBITS));
Csum += C[i];
}
Csum += S2;
uint32_t rv = *reinterpret_cast<uint32_t*>(&Csum);
return rv;
};
dotHelper(gpuDynInst, opImpl);
}
// D.u32 = S2.u32 + S0.u16[0]*S1.u16[0] + S0.u16[1]*S1.u16[1]
// Unsigned products are deliberately not masked per element: overflow is
// allowed to carry across element boundaries, matching observed hardware.
void Inst_VOP3P__V_DOT2_U32_U16::execute(GPUDynInstPtr gpuDynInst)
{
    auto opImpl =
        [](uint32_t S0r, uint32_t S1r, uint32_t S2, bool clamp) -> uint32_t
    {
        constexpr unsigned INBITS = 16;
        constexpr unsigned elems = 32 / INBITS;
        // Seed the accumulator with S2; unsigned addition is associative
        // mod 2^32, so this is equivalent to adding it last.
        uint32_t acc = S2;
        for (unsigned e = 0; e < elems; ++e) {
            uint32_t a = bits(S0r, e * INBITS + INBITS - 1, e * INBITS);
            uint32_t b = bits(S1r, e * INBITS + INBITS - 1, e * INBITS);
            acc += dotClampU<INBITS>(a * b, clamp);
        }
        return acc;
    };
    dotHelper(gpuDynInst, opImpl);
}
// D.i32 = S2.i32 + sum over i in [0,3] of sext(S0.i8[i]) * sext(S1.i8[i])
void Inst_VOP3P__V_DOT4_I32_I8::execute(GPUDynInstPtr gpuDynInst)
{
auto opImpl =
[](uint32_t S0r, uint32_t S1r, uint32_t S2r, bool clamp) -> uint32_t
{
constexpr unsigned INBITS = 8;
constexpr unsigned elems = 32 / INBITS;
uint32_t S0[elems];
uint32_t S1[elems];
// Unpack the packed 8-bit elements of each source dword.
for (int i = 0; i < elems; ++i) {
S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS);
S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS);
}
int32_t S2 = *reinterpret_cast<int32_t*>(&S2r);
// Compute components individually to prevent overflow across packing
int32_t C[elems];
int32_t Csum = 0;
for (int i = 0; i < elems; ++i) {
// Sign extension of each element is required for correct results.
C[i] = sext<INBITS>(S0[i]) * sext<INBITS>(S1[i]);
// Optionally clamp, then wrap each product back to INBITS bits and
// re-sign-extend. Per the commit message this per-element masking of
// signed products matches hardware (validated against an MI210);
// clamp itself has not been verified.
C[i] = sext<INBITS>(dotClampI<INBITS>(C[i], clamp) & mask(INBITS));
Csum += C[i];
}
Csum += S2;
uint32_t rv = *reinterpret_cast<uint32_t*>(&Csum);
return rv;
};
dotHelper(gpuDynInst, opImpl);
}
// D.u32 = S2.u32 + sum over i in [0,3] of S0.u8[i] * S1.u8[i]
// Unsigned products are deliberately not masked per element: overflow is
// allowed to carry across element boundaries, matching observed hardware.
void Inst_VOP3P__V_DOT4_U32_U8::execute(GPUDynInstPtr gpuDynInst)
{
    auto opImpl =
        [](uint32_t S0r, uint32_t S1r, uint32_t S2, bool clamp) -> uint32_t
    {
        constexpr unsigned INBITS = 8;
        constexpr unsigned elems = 32 / INBITS;
        // Seed the accumulator with S2; unsigned addition is associative
        // mod 2^32, so this is equivalent to adding it last.
        uint32_t acc = S2;
        for (unsigned e = 0; e < elems; ++e) {
            uint32_t a = bits(S0r, e * INBITS + INBITS - 1, e * INBITS);
            uint32_t b = bits(S1r, e * INBITS + INBITS - 1, e * INBITS);
            acc += dotClampU<INBITS>(a * b, clamp);
        }
        return acc;
    };
    dotHelper(gpuDynInst, opImpl);
}
// D.i32 = S2.i32 + sum over i in [0,7] of sext(S0.i4[i]) * sext(S1.i4[i])
void Inst_VOP3P__V_DOT8_I32_I4::execute(GPUDynInstPtr gpuDynInst)
{
auto opImpl =
[](uint32_t S0r, uint32_t S1r, uint32_t S2r, bool clamp) -> uint32_t
{
constexpr unsigned INBITS = 4;
constexpr unsigned elems = 32 / INBITS;
uint32_t S0[elems];
uint32_t S1[elems];
// Unpack the packed 4-bit elements of each source dword.
for (int i = 0; i < elems; ++i) {
S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS);
S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS);
}
int32_t S2 = *reinterpret_cast<int32_t*>(&S2r);
// Compute components individually to prevent overflow across packing
int32_t C[elems];
int32_t Csum = 0;
for (int i = 0; i < elems; ++i) {
// Sign extension of each element is required for correct results.
C[i] = sext<INBITS>(S0[i]) * sext<INBITS>(S1[i]);
// Optionally clamp, then wrap each product back to INBITS bits and
// re-sign-extend. Per the commit message this per-element masking of
// signed products matches hardware (validated against an MI210);
// clamp itself has not been verified.
C[i] = sext<INBITS>(dotClampI<INBITS>(C[i], clamp) & mask(INBITS));
Csum += C[i];
}
Csum += S2;
uint32_t rv = *reinterpret_cast<uint32_t*>(&Csum);
return rv;
};
dotHelper(gpuDynInst, opImpl);
}
// D.u32 = S2.u32 + sum over i in [0,7] of S0.u4[i] * S1.u4[i]
// Unsigned products are deliberately not masked per element: overflow is
// allowed to carry across element boundaries, matching observed hardware.
void Inst_VOP3P__V_DOT8_U32_U4::execute(GPUDynInstPtr gpuDynInst)
{
    auto opImpl =
        [](uint32_t S0r, uint32_t S1r, uint32_t S2, bool clamp) -> uint32_t
    {
        constexpr unsigned INBITS = 4;
        constexpr unsigned elems = 32 / INBITS;
        // Seed the accumulator with S2; unsigned addition is associative
        // mod 2^32, so this is equivalent to adding it last.
        uint32_t acc = S2;
        for (unsigned e = 0; e < elems; ++e) {
            uint32_t a = bits(S0r, e * INBITS + INBITS - 1, e * INBITS);
            uint32_t b = bits(S1r, e * INBITS + INBITS - 1, e * INBITS);
            acc += dotClampU<INBITS>(a * b, clamp);
        }
        return acc;
    };
    dotHelper(gpuDynInst, opImpl);
}
void Inst_VOP3P__V_ACCVGPR_READ::execute(GPUDynInstPtr gpuDynInst)
{
    // gem5 does not model a separate Acc register file (it was removed in
    // MI200-class hardware), so reading an AccVGPR degenerates to a plain
    // register-to-register move for every active lane.
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
    VecOperandU32 vdst(gpuDynInst, instData.VDST);
    src.readSrc();
    for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
        if (!wf->execMask(ln)) {
            continue;  // inactive lanes keep their destination value
        }
        vdst[ln] = src[ln];
    }
    vdst.write();
}
void Inst_VOP3P__V_ACCVGPR_WRITE::execute(GPUDynInstPtr gpuDynInst)
{
    // gem5 does not model a separate Acc register file (it was removed in
    // MI200-class hardware), so writing an AccVGPR degenerates to a plain
    // register-to-register move for every active lane.
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
    VecOperandU32 vdst(gpuDynInst, instData.VDST);
    src.readSrc();
    for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
        if (!wf->execMask(ln)) {
            continue;  // inactive lanes keep their destination value
        }
        vdst[ln] = src[ln];
    }
    vdst.write();
}
} // namespace VegaISA
} // namespace gem5

View File

@@ -42,6 +42,41 @@ namespace gem5
namespace VegaISA
{
// One source operand
// Base class for VOP3P instructions with a single dword source operand and
// a single dword destination (used by the AccVGPR read/write moves).
class Inst_VOP3P__1OP : public Inst_VOP3P
{
public:
Inst_VOP3P__1OP(InFmt_VOP3P *iFmt, const std::string& name)
: Inst_VOP3P(iFmt, name)
{
setFlag(ALU);
}
int
getNumOperands() override
{
return numDstRegOperands() + numSrcRegOperands();
} // getNumOperands
int numDstRegOperands() override { return 1; }
int numSrcRegOperands() override { return 1; }
// Both operands are a single 32-bit register (4 bytes).
int
getOperandSize(int opIdx) override
{
switch (opIdx) {
case 0: // src
return 4;
case 1: // dst
return 4;
default:
fatal("op idx %i out of bounds\n", opIdx);
return -1;
}
}
};
// Two source operands with two 16-bit values in a dword
class Inst_VOP3P__2OP_X16 : public Inst_VOP3P
{
@@ -310,6 +345,96 @@ namespace VegaISA
void execute(GPUDynInstPtr gpuDynInst) override;
};
// D.f32 = S2.f32 + S0.f16[0]*S1.f16[0] + S0.f16[1]*S1.f16[1]
class Inst_VOP3P__V_DOT2_F32_F16 : public Inst_VOP3P__3OP_X16
{
public:
Inst_VOP3P__V_DOT2_F32_F16(InFmt_VOP3P *iFmt)
: Inst_VOP3P__3OP_X16(iFmt, "v_dot2_f32_f16")
{ }
void execute(GPUDynInstPtr gpuDynInst) override;
};
// D.i32 = S2.i32 + dot product of the two signed 16-bit element pairs.
class Inst_VOP3P__V_DOT2_I32_I16 : public Inst_VOP3P__3OP_X16
{
public:
Inst_VOP3P__V_DOT2_I32_I16(InFmt_VOP3P *iFmt)
: Inst_VOP3P__3OP_X16(iFmt, "v_dot2_i32_i16")
{ }
void execute(GPUDynInstPtr gpuDynInst) override;
};
// D.u32 = S2.u32 + dot product of the two unsigned 16-bit element pairs.
class Inst_VOP3P__V_DOT2_U32_U16 : public Inst_VOP3P__3OP_X16
{
public:
Inst_VOP3P__V_DOT2_U32_U16(InFmt_VOP3P *iFmt)
: Inst_VOP3P__3OP_X16(iFmt, "v_dot2_u32_u16")
{ }
void execute(GPUDynInstPtr gpuDynInst) override;
};
// D.i32 = S2.i32 + dot product of the four signed 8-bit element pairs.
class Inst_VOP3P__V_DOT4_I32_I8 : public Inst_VOP3P__3OP_X16
{
public:
Inst_VOP3P__V_DOT4_I32_I8(InFmt_VOP3P *iFmt)
: Inst_VOP3P__3OP_X16(iFmt, "v_dot4_i32_i8")
{ }
void execute(GPUDynInstPtr gpuDynInst) override;
};
// D.u32 = S2.u32 + dot product of the four unsigned 8-bit element pairs.
class Inst_VOP3P__V_DOT4_U32_U8 : public Inst_VOP3P__3OP_X16
{
public:
Inst_VOP3P__V_DOT4_U32_U8(InFmt_VOP3P *iFmt)
: Inst_VOP3P__3OP_X16(iFmt, "v_dot4_u32_u8")
{ }
void execute(GPUDynInstPtr gpuDynInst) override;
};
// D.i32 = S2.i32 + dot product of the eight signed 4-bit element pairs.
class Inst_VOP3P__V_DOT8_I32_I4 : public Inst_VOP3P__3OP_X16
{
public:
Inst_VOP3P__V_DOT8_I32_I4(InFmt_VOP3P *iFmt)
: Inst_VOP3P__3OP_X16(iFmt, "v_dot8_i32_i4")
{ }
void execute(GPUDynInstPtr gpuDynInst) override;
};
// D.u32 = S2.u32 + dot product of the eight unsigned 4-bit element pairs.
class Inst_VOP3P__V_DOT8_U32_U4 : public Inst_VOP3P__3OP_X16
{
public:
Inst_VOP3P__V_DOT8_U32_U4(InFmt_VOP3P *iFmt)
: Inst_VOP3P__3OP_X16(iFmt, "v_dot8_u32_u4")
{ }
void execute(GPUDynInstPtr gpuDynInst) override;
};
// AccVGPR read; implemented as a plain VGPR move in gem5 (no Acc file).
class Inst_VOP3P__V_ACCVGPR_READ : public Inst_VOP3P__1OP
{
public:
Inst_VOP3P__V_ACCVGPR_READ(InFmt_VOP3P *iFmt)
: Inst_VOP3P__1OP(iFmt, "v_accvgpr_read")
{ }
void execute(GPUDynInstPtr gpuDynInst) override;
};
// AccVGPR write; implemented as a plain VGPR move in gem5 (no Acc file).
class Inst_VOP3P__V_ACCVGPR_WRITE : public Inst_VOP3P__1OP
{
public:
Inst_VOP3P__V_ACCVGPR_WRITE(InFmt_VOP3P *iFmt)
: Inst_VOP3P__1OP(iFmt, "v_accvgpr_write")
{ }
void execute(GPUDynInstPtr gpuDynInst) override;
};
} // namespace VegaISA
} // namespace gem5