arch-vega: Improve SDWA, SDWAB, and DPP (#1378)

This PR has four components: - Implement a helper method for SDWAB similar to SDWA helpers. SDWAB is used for VOPC instructions only (vector compares). - Update two instructions commonly using SDWAB to use helper (v_cmp_ne_u16 and v_cmp_eq_u16). - Add panics to *all* VOP1 and VOP2 instructions which do not implement SDWA or DPP if they use an SDWA or DPP register. - Add panics to *all* VOPC instructions which do not implement SDWAB or DPP if they are an SDWA or DPP register. Only VOP1, VOP2, and VOPC may use SDWA, SDWAB, or DPP. The panics should therefore cover all instructions which have missing implementations for these modes. The intent is to exit gem5 instead of continuing simulation will data that is likely incorrect. Continuing simulation only makes debugging gem5 more difficult.
2024-07-26 07:12:10 -07:00
parent b3b289ae81 21f6e166b7
commit 37ca94450a
4 changed files with 862 additions and 11 deletions
--- a/src/arch/amdgpu/vega/insts/op_encodings.hh
+++ b/src/arch/amdgpu/vega/insts/op_encodings.hh
@@ -35,6 +35,7 @@
 #include "arch/amdgpu/vega/gpu_decoder.hh"
 #include "arch/amdgpu/vega/gpu_mem_helpers.hh"
 #include "arch/amdgpu/vega/insts/gpu_static_inst.hh"
+#include "arch/amdgpu/vega/insts/inst_util.hh"
 #include "arch/amdgpu/vega/operand.hh"
 #include "debug/GPUExec.hh"
 #include "debug/VEGA.hh"
@@ -421,6 +422,159 @@ namespace VegaISA
        InstFormat extData;
        uint32_t varSize;

+        template<typename T>
+        uint32_t
+        sdwabSelect(uint32_t dword, const SDWASelVals sel,
+                    bool sign_ext, bool neg, bool abs)
+        {
+            // Use the gem5 bits() helper to select a sub region from the
+            // dword based on the select. Return a 32-bit unsigned which will
+            // be cast to the appropriate compare type in the lambda passed to
+            // sdwabHelper.
+            int low_bit = 0, high_bit = 0;
+            uint32_t rv = dword;
+
+            if (sel < SDWA_WORD_0) {
+                // Selecting a sub-dword value smaller than a word (i.e., a
+                // byte). These values are 0-3 so multiplying by BITS_PER_BYTE
+                // gives the lower and upper bit easily.
+                low_bit = sel * VegaISA::BITS_PER_BYTE;
+                high_bit = low_bit + VegaISA::BITS_PER_BYTE - 1;
+            } else if (sel < SDWA_DWORD) {
+                // Selecting a sub-dword value of word size. Enum value is 4
+                // or 5, so selecting the LSb and multiplying gives the lower
+                // and upper bit.
+                low_bit = (sel & 1) * VegaISA::BITS_PER_WORD;
+                high_bit = low_bit + VegaISA::MSB_PER_WORD - 1;
+            } else {
+                // We are selecting the whole dword. Assert that is true and
+                // set the bit locations for lower and upper based on dword
+                // size.
+                assert(sel == SDWA_DWORD);
+                low_bit = 0;
+                high_bit = sizeof(uint32_t) * VegaISA::BITS_PER_BYTE - 1;
+            }
+
+            rv = bits(dword, high_bit, low_bit);
+
+            uint32_t sign_bit = 1 << high_bit;
+
+            // Panic on combinations which do not make sense.
+            if (std::is_integral_v<T> && std::is_unsigned_v<T>) {
+                panic_if(neg, "SWDAB negation operation on unsigned type!\n");
+                panic_if(sign_ext, "SWDAB sign extend on unsigned type!\n");
+            }
+
+            // Apply ABS, then NEG, then SEXT.
+            if (abs) {
+                if (std::is_integral_v<T>) {
+                    // If sign is set, sign extend first then call std::abs.
+                    if ((rv & sign_bit) && std::is_signed_v<T>) {
+                        rv = sext(rv, high_bit + 1) & 0xFFFFFFFF;
+                        rv = std::abs(static_cast<long long>(rv)) & 0xFFFFFFFF;
+                    }
+                } else {
+                    // Clear sign bit for FP types.
+                    rv = rv & mask(high_bit);
+                }
+            }
+
+            if (neg) {
+                if (std::is_integral_v<T>) {
+                    // If sign is set, sign extend first then call unary-.
+                    if (rv & sign_bit) {
+                        rv = sext(rv, high_bit + 1) & 0xFFFFFFFF;
+                        rv = -rv;
+                    }
+                } else {
+                    // Flip sign bit for FP types.
+                    rv = rv ^ mask(high_bit);
+                }
+            }
+
+            if (sign_ext) {
+                if (std::is_integral_v<T>) {
+                    if (rv & sign_bit) {
+                        rv = sext(rv, high_bit + 1) & 0xFFFFFFFF;
+                    }
+                } else {
+                    // It is not entirely clear what to do here. Literal
+                    // extensions for FP operands append zeros to mantissa
+                    // but specification does not state anything for SDWAB.
+                    panic("SDWAB sign extend set for non-integral type!\n");
+                }
+            }
+
+            return rv;
+        }
+
+        template<typename T>
+        void
+        sdwabHelper(GPUDynInstPtr gpuDynInst, int (*cmpFunc)(T, T))
+        {
+            DPRINTF(VEGA, "Handling %s SRC SDWA. SRC0: register %s[%d], "
+                    "sDst s[%d], sDst type %s, SRC0_SEL: %d, SRC0_SEXT: %d "
+                    "SRC0_NEG: %d, SRC0_ABS: %d, SRC1: register %s[%d], "
+                    "SRC1_SEL: %d, SRC1_SEXT: %d, SRC1_NEG: %d, SRC1_ABS: "
+                    "%d\n", _opcode.c_str(),
+                    (extData.iFmt_VOP_SDWAB.S0 ? "s" : "v"),
+                    extData.iFmt_VOP_SDWAB.SRC0,
+                    extData.iFmt_VOP_SDWAB.SDST,
+                    (extData.iFmt_VOP_SDWAB.SD ? "SGPR" : "VCC"),
+                    extData.iFmt_VOP_SDWAB.SRC0_SEL,
+                    extData.iFmt_VOP_SDWAB.SRC0_SEXT,
+                    extData.iFmt_VOP_SDWAB.SRC0_NEG,
+                    extData.iFmt_VOP_SDWAB.SRC0_ABS,
+                    (extData.iFmt_VOP_SDWAB.S1 ? "s" : "v"),
+                    instData.VSRC1,
+                    extData.iFmt_VOP_SDWAB.SRC1_SEL,
+                    extData.iFmt_VOP_SDWAB.SRC1_SEXT,
+                    extData.iFmt_VOP_SDWAB.SRC1_NEG,
+                    extData.iFmt_VOP_SDWAB.SRC1_ABS);
+
+            // Start with SRC0 and insert 9th bit for VGPR source (S0 == 0).
+            int src0_idx = extData.iFmt_VOP_SDWAB.SRC0;
+            src0_idx += (extData.iFmt_VOP_SDWAB.S0 == 0) ? 0x100 : 0;
+
+            // Start with VSRC1[7:0], insert 9th bit for VGPR source (S1 == 0).
+            int src1_idx = instData.VSRC1;
+            src1_idx += (extData.iFmt_VOP_SDWAB.S1 == 0) ? 0x100 : 0;
+
+            // SD == 0 if VCC is dest, else use SDST index.
+            int sdst_idx = (extData.iFmt_VOP_SDWAB.SD == 1) ?
+                int(extData.iFmt_VOP_SDWAB.SDST) : REG_VCC_LO;
+
+            ConstVecOperandU32 src0(gpuDynInst, src0_idx);
+            ConstVecOperandU32 src1(gpuDynInst, src1_idx);
+            ScalarOperandU64 sdst(gpuDynInst, sdst_idx);
+
+            // Use readSrc in case of scalar const register.
+            src0.readSrc();
+            src1.readSrc();
+
+            // Select bits first, then cast to type, then apply modifiers.
+            const SDWASelVals src0_sel =
+                (SDWASelVals)extData.iFmt_VOP_SDWAB.SRC0_SEL;
+            const SDWASelVals src1_sel =
+                (SDWASelVals)extData.iFmt_VOP_SDWAB.SRC1_SEL;
+
+            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                if (gpuDynInst->wavefront()->execMask(lane)) {
+                    T a = sdwabSelect<T>(src0[lane], src0_sel,
+                                         extData.iFmt_VOP_SDWAB.SRC0_SEXT,
+                                         extData.iFmt_VOP_SDWAB.SRC0_NEG,
+                                         extData.iFmt_VOP_SDWAB.SRC0_ABS);
+                    T b = sdwabSelect<T>(src1[lane], src1_sel,
+                                         extData.iFmt_VOP_SDWAB.SRC1_SEXT,
+                                         extData.iFmt_VOP_SDWAB.SRC1_NEG,
+                                         extData.iFmt_VOP_SDWAB.SRC1_ABS);
+                    sdst.setBit(lane, cmpFunc(a, b));
+                }
+            }
+
+            sdst.write();
+        }
+
      private:
        bool hasSecondDword(InFmt_VOPC *);
    }; // Inst_VOPC
--- a/src/arch/amdgpu/vega/insts/vop1.cc
+++ b/src/arch/amdgpu/vega/insts/vop1.cc
@@ -80,6 +80,8 @@ namespace VegaISA

        src.readSrc();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+
        if (isDPPInst()) {
            VecOperandU32 src_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0);
            src_dpp.read();
@@ -148,6 +150,9 @@ namespace VegaISA

        src.readSrc();

+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
        if (exec_mask) {
            src_lane = findLsbSet(exec_mask);
        }
@@ -182,6 +187,9 @@ namespace VegaISA

        src.readSrc();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                int exp;
@@ -226,6 +234,9 @@ namespace VegaISA

        src.readSrc();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = (VecElemF64)src[lane];
@@ -258,6 +269,9 @@ namespace VegaISA

        src.readSrc();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = (VecElemF32)src[lane];
@@ -290,6 +304,9 @@ namespace VegaISA

        src.readSrc();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = (VecElemF32)src[lane];
@@ -324,6 +341,9 @@ namespace VegaISA

        src.readSrc();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                int exp;
@@ -372,6 +392,9 @@ namespace VegaISA

        src.readSrc();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                int exp;
@@ -439,6 +462,9 @@ namespace VegaISA

        src.readSrc();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                float tmp = src[lane];
@@ -475,6 +501,9 @@ namespace VegaISA

        src.readSrc();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                AMDGPU::mxfloat16 tmp(src[lane]);
@@ -509,6 +538,9 @@ namespace VegaISA

        src.readSrc();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = (VecElemI32)std::floor(src[lane] + 0.5);
@@ -542,6 +574,9 @@ namespace VegaISA

        src.readSrc();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = (VecElemI32)std::floor(src[lane]);
@@ -595,6 +630,9 @@ namespace VegaISA

        src.readSrc();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = (VecElemF32)src[lane];
@@ -627,6 +665,9 @@ namespace VegaISA

        src.readSrc();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = (VecElemF64)src[lane];
@@ -659,6 +700,9 @@ namespace VegaISA

        src.readSrc();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = (VecElemF32)(bits(src[lane], 7, 0));
@@ -691,6 +735,9 @@ namespace VegaISA

        src.readSrc();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = (VecElemF32)(bits(src[lane], 15, 8));
@@ -723,6 +770,9 @@ namespace VegaISA

        src.readSrc();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = (VecElemF32)(bits(src[lane], 23, 16));
@@ -755,6 +805,9 @@ namespace VegaISA

        src.readSrc();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = (VecElemF32)(bits(src[lane], 31, 24));
@@ -789,6 +842,9 @@ namespace VegaISA

        src.readSrc();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                int exp;
@@ -835,6 +891,9 @@ namespace VegaISA

        src.readSrc();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = (VecElemF64)src[lane];
@@ -867,6 +926,9 @@ namespace VegaISA

        src.readSrc();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::trunc(src[lane]);
@@ -900,6 +962,9 @@ namespace VegaISA

        src.readSrc();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::ceil(src[lane]);
@@ -932,6 +997,9 @@ namespace VegaISA

        src.readSrc();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = roundNearestEven(src[lane]);
@@ -965,6 +1033,9 @@ namespace VegaISA

        src.readSrc();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::floor(src[lane]);
@@ -997,6 +1068,9 @@ namespace VegaISA

        src.readSrc();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                VecElemF32 int_part(0.0);
@@ -1030,6 +1104,9 @@ namespace VegaISA

        src.readSrc();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::trunc(src[lane]);
@@ -1063,6 +1140,9 @@ namespace VegaISA

        src.readSrc();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::ceil(src[lane]);
@@ -1095,6 +1175,9 @@ namespace VegaISA

        src.readSrc();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = roundNearestEven(src[lane]);
@@ -1128,6 +1211,9 @@ namespace VegaISA

        src.readSrc();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::floor(src[lane]);
@@ -1160,6 +1246,9 @@ namespace VegaISA

        src.readSrc();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::pow(2.0, src[lane]);
@@ -1192,6 +1281,9 @@ namespace VegaISA

        src.readSrc();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::log2(src[lane]);
@@ -1224,6 +1316,9 @@ namespace VegaISA

        src.readSrc();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = 1.0 / src[lane];
@@ -1258,6 +1353,9 @@ namespace VegaISA

        src.readSrc();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = 1.0 / src[lane];
@@ -1290,6 +1388,9 @@ namespace VegaISA

        src.readSrc();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = 1.0 / std::sqrt(src[lane]);
@@ -1322,6 +1423,9 @@ namespace VegaISA

        src.readSrc();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                if (std::fpclassify(src[lane]) == FP_ZERO) {
@@ -1366,6 +1470,9 @@ namespace VegaISA

        src.readSrc();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                if (std::fpclassify(src[lane]) == FP_ZERO) {
@@ -1409,6 +1516,9 @@ namespace VegaISA

        src.readSrc();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::sqrt(src[lane]);
@@ -1441,6 +1551,9 @@ namespace VegaISA

        src.readSrc();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::sqrt(src[lane]);
@@ -1477,6 +1590,9 @@ namespace VegaISA
        src.readSrc();
        pi.read();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                if (src[lane] < -256.0 || src[lane] > 256.0) {
@@ -1517,6 +1633,9 @@ namespace VegaISA
        src.readSrc();
        pi.read();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                if (src[lane] < -256.0 || src[lane] > 256.0) {
@@ -1553,6 +1672,9 @@ namespace VegaISA

        src.readSrc();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = ~src[lane];
@@ -1585,6 +1707,9 @@ namespace VegaISA

        src.readSrc();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = reverseBits(src[lane]);
@@ -1617,6 +1742,9 @@ namespace VegaISA

        src.readSrc();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = findFirstOneMsb(src[lane]);
@@ -1649,6 +1777,9 @@ namespace VegaISA

        src.readSrc();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = findFirstOne(src[lane]);
@@ -1681,6 +1812,9 @@ namespace VegaISA

        src.readSrc();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = firstOppositeSignBit(src[lane]);
@@ -1714,6 +1848,9 @@ namespace VegaISA

        src.readSrc();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                if (std::isinf(src[lane]) || std::isnan(src[lane])) {
@@ -1752,6 +1889,9 @@ namespace VegaISA

        src.readSrc();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                if (std::isinf(src[lane]) || std::isnan(src[lane])) {
@@ -1789,6 +1929,9 @@ namespace VegaISA

        src.readSrc();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                VecElemF64 int_part(0.0);
@@ -1827,6 +1970,9 @@ namespace VegaISA

        src.readSrc();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                if (std::isinf(src[lane]) || std::isnan(src[lane])) {
@@ -1870,6 +2016,9 @@ namespace VegaISA

        src.readSrc();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                if (std::isinf(src[lane]) || std::isnan(src[lane])) {
@@ -1926,8 +2075,8 @@ namespace VegaISA

        src.readSrc();

-        panic_if(isDPPInst(), "DPP unimplemented for v_mov_b64");
-        panic_if(isSDWAInst(), "SDWA unimplemented for v_mov_b64");
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
@@ -2359,6 +2508,9 @@ namespace VegaISA

        src.readSrc();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::pow(2.0, src[lane]);
@@ -2391,6 +2543,9 @@ namespace VegaISA

        src.readSrc();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::log2(src[lane]);
@@ -2423,6 +2578,9 @@ namespace VegaISA

        src.readSrc();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src[lane];
--- a/src/arch/amdgpu/vega/insts/vop2.cc
+++ b/src/arch/amdgpu/vega/insts/vop2.cc
@@ -67,6 +67,9 @@ namespace VegaISA
        src1.read();
        vcc.read();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane]
@@ -102,6 +105,8 @@ namespace VegaISA
        src0.readSrc();
        src1.read();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+
        if (isDPPInst()) {
            VecOperandF32 src0_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0);
            src0_dpp.read();
@@ -163,6 +168,9 @@ namespace VegaISA
        src0.readSrc();
        src1.read();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src0[lane] - src1[lane];
@@ -198,6 +206,9 @@ namespace VegaISA
        src0.readSrc();
        src1.read();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src1[lane] - src0[lane];
@@ -232,6 +243,9 @@ namespace VegaISA
        src0.readSrc();
        src1.read();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src0[lane] * src1[lane];
@@ -266,6 +280,9 @@ namespace VegaISA
        src0.readSrc();
        src1.read();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                if (std::isnan(src0[lane]) ||
@@ -344,6 +361,9 @@ namespace VegaISA
        src0.readSrc();
        src1.read();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = sext<24>(bits(src0[lane], 23, 0))
@@ -378,6 +398,9 @@ namespace VegaISA
        src0.readSrc();
        src1.read();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                VecElemI64 tmp_src0
@@ -445,6 +468,9 @@ namespace VegaISA
        src0.readSrc();
        src1.read();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                VecElemU64 tmp_src0 = (VecElemU64)bits(src0[lane], 23, 0);
@@ -481,6 +507,9 @@ namespace VegaISA
        src0.readSrc();
        src1.read();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::fmin(src0[lane], src1[lane]);
@@ -515,6 +544,9 @@ namespace VegaISA
        src0.readSrc();
        src1.read();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::fmax(src0[lane], src1[lane]);
@@ -548,6 +580,9 @@ namespace VegaISA
        src0.readSrc();
        src1.read();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::min(src0[lane], src1[lane]);
@@ -581,6 +616,9 @@ namespace VegaISA
        src0.readSrc();
        src1.read();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::max(src0[lane], src1[lane]);
@@ -614,6 +652,9 @@ namespace VegaISA
        src0.readSrc();
        src1.read();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::min(src0[lane], src1[lane]);
@@ -647,6 +688,9 @@ namespace VegaISA
        src0.readSrc();
        src1.read();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::max(src0[lane], src1[lane]);
@@ -682,6 +726,9 @@ namespace VegaISA
        src0.readSrc();
        src1.read();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src1[lane] >> bits(src0[lane], 4, 0);
@@ -717,6 +764,9 @@ namespace VegaISA
        src0.readSrc();
        src1.read();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src1[lane] >> bits(src0[lane], 4, 0);
@@ -751,6 +801,8 @@ namespace VegaISA
        src0.readSrc();
        src1.read();

+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        if (isSDWAInst()) {
            VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
            // use copies of original src0, src1, and vdst during selecting
@@ -826,6 +878,8 @@ namespace VegaISA
        src0.readSrc();
        src1.read();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+
        if (isDPPInst()) {
            VecOperandU32 src0_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0);
            src0_dpp.read();
@@ -886,6 +940,8 @@ namespace VegaISA
        src0.readSrc();
        src1.read();

+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        if (isSDWAInst()) {
            VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
            // use copies of original src0, src1, and dest during selecting
@@ -961,6 +1017,9 @@ namespace VegaISA
        src0.readSrc();
        src1.read();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src0[lane] ^ src1[lane];
@@ -998,6 +1057,8 @@ namespace VegaISA
        src1.read();
        vdst.read();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+
        if (isDPPInst()) {
            VecOperandF32 src0_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0);
            src0_dpp.read();
@@ -1064,6 +1125,9 @@ namespace VegaISA
        src0.readSrc();
        src1.read();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::fma(src0[lane], k, src1[lane]);
@@ -1103,6 +1167,9 @@ namespace VegaISA
        src0.readSrc();
        src1.read();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::fma(src0[lane], src1[lane], k);
@@ -1141,6 +1208,8 @@ namespace VegaISA
        src0.readSrc();
        src1.read();

+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        if (isSDWAInst()) {
            VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
            // use copies of original src0, src1, and dest during selecting
@@ -1225,6 +1294,9 @@ namespace VegaISA
        src0.readSrc();
        src1.read();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src0[lane] - src1[lane];
@@ -1265,6 +1337,9 @@ namespace VegaISA
        src0.readSrc();
        src1.read();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src1[lane] - src0[lane];
@@ -1308,6 +1383,9 @@ namespace VegaISA
        src1.read();
        vcc.read();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src0[lane] + src1[lane]
@@ -1355,6 +1433,9 @@ namespace VegaISA
        src1.read();
        vcc.read();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane]
@@ -1401,6 +1482,9 @@ namespace VegaISA
        src1.read();
        vcc.read();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane]
@@ -1598,6 +1682,9 @@ namespace VegaISA
        src0.readSrc();
        src1.read();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src0[lane] + src1[lane];
@@ -1632,6 +1719,9 @@ namespace VegaISA
        src0.readSrc();
        src1.read();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src0[lane] - src1[lane];
@@ -1667,6 +1757,9 @@ namespace VegaISA
        src0.readSrc();
        src1.read();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src1[lane] - src0[lane];
@@ -1701,6 +1794,9 @@ namespace VegaISA
        src0.readSrc();
        src1.read();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src0[lane] * src1[lane];
@@ -1735,6 +1831,9 @@ namespace VegaISA
        src0.readSrc();
        src1.read();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src1[lane] << bits(src0[lane], 3, 0);
@@ -1770,6 +1869,9 @@ namespace VegaISA
        src0.readSrc();
        src1.read();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src1[lane] >> src0[lane];
@@ -1805,6 +1907,9 @@ namespace VegaISA
        src0.readSrc();
        src1.read();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src1[lane] >> src0[lane];
@@ -1882,6 +1987,9 @@ namespace VegaISA
        src0.readSrc();
        src1.read();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::max(src0[lane], src1[lane]);
@@ -1915,6 +2023,9 @@ namespace VegaISA
        src0.readSrc();
        src1.read();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::max(src0[lane], src1[lane]);
@@ -1948,6 +2059,9 @@ namespace VegaISA
        src0.readSrc();
        src1.read();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::min(src0[lane], src1[lane]);
@@ -1981,6 +2095,9 @@ namespace VegaISA
        src0.readSrc();
        src1.read();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::min(src0[lane], src1[lane]);
@@ -2034,6 +2151,8 @@ namespace VegaISA
        src0.readSrc();
        src1.read();

+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        if (isSDWAInst()) {
            VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
            // use copies of original src0, src1, and dest during selecting
@@ -2108,6 +2227,9 @@ namespace VegaISA
        src0.readSrc();
        src1.read();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src0[lane] - src1[lane];
@@ -2141,6 +2263,9 @@ namespace VegaISA
        src0.readSrc();
        src1.read();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src1[lane] - src0[lane];
@@ -2175,6 +2300,9 @@ namespace VegaISA
        src1.read();
        vdst.read();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::fma(src0[lane], src1[lane], vdst[lane]);
@@ -2209,6 +2337,9 @@ namespace VegaISA
        src1.read();
        vdst.read();

+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = ~(src0[lane] ^ src1[lane]);
--- a/src/arch/amdgpu/vega/insts/vopc.cc
+++ b/src/arch/amdgpu/vega/insts/vopc.cc