From 69338703e7913f04576efe2f4d0f29c822aa5079 Mon Sep 17 00:00:00 2001
From: Matthew Poremba <matthew.poremba@amd.com>
Date: Wed, 17 Jul 2024 15:01:10 -0700
Subject: [PATCH 1/4] arch-vega: Implement SDWAB helper

Implement a SDWAB helper which accepts a dynamic instruction and a
lambda function defining a comparison function taking two values and
returning a comparison result of 0 or 1 for false or true.

Current instructions which implement SDWA do so on a per-instruction
basis which adds a lot of redundant code. This allows for generic SDWAB
implementations for VOPC instructions.

All modifiers are implemented assuming that SDWBA VOPC instruction
comparison types may be U32, I32, F32, U16, I16, F16 (which exist) but
is extendible to I8, U8, or F8.

Change-Id: Idab58a327c29dd19a1a5457237f3799a04f2031b
---
 src/arch/amdgpu/vega/insts/op_encodings.hh | 154 +++++++++++++++++++++
 1 file changed, 154 insertions(+)
diff --git a/src/arch/amdgpu/vega/insts/op_encodings.hh b/src/arch/amdgpu/vega/insts/op_encodings.hh
index 504946534f..c8ad4ef3d3 100644
--- a/src/arch/amdgpu/vega/insts/op_encodings.hh
+++ b/src/arch/amdgpu/vega/insts/op_encodings.hh
@@ -35,6 +35,7 @@
 #include "arch/amdgpu/vega/gpu_decoder.hh"
 #include "arch/amdgpu/vega/gpu_mem_helpers.hh"
 #include "arch/amdgpu/vega/insts/gpu_static_inst.hh"
+#include "arch/amdgpu/vega/insts/inst_util.hh"
 #include "arch/amdgpu/vega/operand.hh"
 #include "debug/GPUExec.hh"
 #include "debug/VEGA.hh"
@@ -421,6 +422,159 @@ namespace VegaISA
         InstFormat extData;
         uint32_t varSize;
 
+        template<typename T>
+        uint32_t
+        sdwabSelect(uint32_t dword, const SDWASelVals sel,
+                    bool sign_ext, bool neg, bool abs)
+        {
+            // Use the gem5 bits() helper to select a sub region from the
+            // dword based on the select. Return a 32-bit unsigned which will
+            // be cast to the appropriate compare type in the lambda passed to
+            // sdwabHelper.
+            int low_bit = 0, high_bit = 0;
+            uint32_t rv = dword;
+
+            if (sel < SDWA_WORD_0) {
+                // Selecting a sub-dword value smaller than a word (i.e., a
+                // byte). These values are 0-3 so multiplying by BITS_PER_BYTE
+                // gives the lower and upper bit easily.
+                low_bit = sel * VegaISA::BITS_PER_BYTE;
+                high_bit = low_bit + VegaISA::BITS_PER_BYTE - 1;
+            } else if (sel < SDWA_DWORD) {
+                // Selecting a sub-dword value of word size. Enum value is 4
+                // or 5, so selecting the LSb and multiplying gives the lower
+                // and upper bit.
+                low_bit = (sel & 1) * VegaISA::BITS_PER_WORD;
+                high_bit = low_bit + VegaISA::MSB_PER_WORD - 1;
+            } else {
+                // We are selecting the whole dword. Assert that is true and
+                // set the bit locations for lower and upper based on dword
+                // size.
+                assert(sel == SDWA_DWORD);
+                low_bit = 0;
+                high_bit = sizeof(uint32_t) * VegaISA::BITS_PER_BYTE - 1;
+            }
+
+            rv = bits(dword, high_bit, low_bit);
+
+            uint32_t sign_bit = 1 << high_bit;
+
+            // Panic on combinations which do not make sense.
+            if (std::is_integral_v<T> && std::is_unsigned_v<T>) {
+                panic_if(neg, "SWDAB negation operation on unsigned type!\n");
+                panic_if(sign_ext, "SWDAB sign extend on unsigned type!\n");
+            }
+
+            // Apply ABS, then NEG, then SEXT.
+            if (abs) {
+                if (std::is_integral_v<T>) {
+                    // If sign is set, sign extend first then call std::abs.
+                    if ((rv & sign_bit) && std::is_signed_v<T>) {
+                        rv = sext(rv, high_bit + 1) & 0xFFFFFFFF;
+                        rv = std::abs(static_cast<long long>(rv)) & 0xFFFFFFFF;
+                    }
+                } else {
+                    // Clear sign bit for FP types.
+                    rv = rv & mask(high_bit);
+                }
+            }
+
+            if (neg) {
+                if (std::is_integral_v<T>) {
+                    // If sign is set, sign extend first then call unary-.
+                    if (rv & sign_bit) {
+                        rv = sext(rv, high_bit + 1) & 0xFFFFFFFF;
+                        rv = -rv;
+                    }
+                } else {
+                    // Flip sign bit for FP types.
+                    rv = rv ^ mask(high_bit);
+                }
+            }
+
+            if (sign_ext) {
+                if (std::is_integral_v<T>) {
+                    if (rv & sign_bit) {
+                        rv = sext(rv, high_bit + 1) & 0xFFFFFFFF;
+                    }
+                } else {
+                    // It is not entirely clear what to do here. Literal
+                    // extensions for FP operands append zeros to mantissa
+                    // but specification does not state anything for SDWAB.
+                    panic("SDWAB sign extend set for non-integral type!\n");
+                }
+            }
+
+            return rv;
+        }
+
+        template<typename T>
+        void
+        sdwabHelper(GPUDynInstPtr gpuDynInst, int (*cmpFunc)(T, T))
+        {
+            DPRINTF(VEGA, "Handling %s SRC SDWA. SRC0: register %s[%d], "
+                    "sDst s[%d], sDst type %s, SRC0_SEL: %d, SRC0_SEXT: %d "
+                    "SRC0_NEG: %d, SRC0_ABS: %d, SRC1: register %s[%d], "
+                    "SRC1_SEL: %d, SRC1_SEXT: %d, SRC1_NEG: %d, SRC1_ABS: "
+                    "%d\n", _opcode.c_str(),
+                    (extData.iFmt_VOP_SDWAB.S0 ? "s" : "v"),
+                    extData.iFmt_VOP_SDWAB.SRC0,
+                    extData.iFmt_VOP_SDWAB.SDST,
+                    (extData.iFmt_VOP_SDWAB.SD ? "SGPR" : "VCC"),
+                    extData.iFmt_VOP_SDWAB.SRC0_SEL,
+                    extData.iFmt_VOP_SDWAB.SRC0_SEXT,
+                    extData.iFmt_VOP_SDWAB.SRC0_NEG,
+                    extData.iFmt_VOP_SDWAB.SRC0_ABS,
+                    (extData.iFmt_VOP_SDWAB.S1 ? "s" : "v"),
+                    instData.VSRC1,
+                    extData.iFmt_VOP_SDWAB.SRC1_SEL,
+                    extData.iFmt_VOP_SDWAB.SRC1_SEXT,
+                    extData.iFmt_VOP_SDWAB.SRC1_NEG,
+                    extData.iFmt_VOP_SDWAB.SRC1_ABS);
+
+            // Start with SRC0 and insert 9th bit for VGPR source (S0 == 0).
+            int src0_idx = extData.iFmt_VOP_SDWAB.SRC0;
+            src0_idx += (extData.iFmt_VOP_SDWAB.S0 == 0) ? 0x100 : 0;
+
+            // Start with VSRC1[7:0], insert 9th bit for VGPR source (S1 == 0).
+            int src1_idx = instData.VSRC1;
+            src1_idx += (extData.iFmt_VOP_SDWAB.S1 == 0) ? 0x100 : 0;
+
+            // SD == 0 if VCC is dest, else use SDST index.
+            int sdst_idx = (extData.iFmt_VOP_SDWAB.SD == 1) ?
+                int(extData.iFmt_VOP_SDWAB.SDST) : REG_VCC_LO;
+
+            ConstVecOperandU32 src0(gpuDynInst, src0_idx);
+            ConstVecOperandU32 src1(gpuDynInst, src1_idx);
+            ScalarOperandU64 sdst(gpuDynInst, sdst_idx);
+
+            // Use readSrc in case of scalar const register.
+            src0.readSrc();
+            src1.readSrc();
+
+            // Select bits first, then cast to type, then apply modifiers.
+            const SDWASelVals src0_sel =
+                (SDWASelVals)extData.iFmt_VOP_SDWAB.SRC0_SEL;
+            const SDWASelVals src1_sel =
+                (SDWASelVals)extData.iFmt_VOP_SDWAB.SRC1_SEL;
+
+            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                if (gpuDynInst->wavefront()->execMask(lane)) {
+                    T a = sdwabSelect<T>(src0[lane], src0_sel,
+                                         extData.iFmt_VOP_SDWAB.SRC0_SEXT,
+                                         extData.iFmt_VOP_SDWAB.SRC0_NEG,
+                                         extData.iFmt_VOP_SDWAB.SRC0_ABS);
+                    T b = sdwabSelect<T>(src1[lane], src1_sel,
+                                         extData.iFmt_VOP_SDWAB.SRC1_SEXT,
+                                         extData.iFmt_VOP_SDWAB.SRC1_NEG,
+                                         extData.iFmt_VOP_SDWAB.SRC1_ABS);
+                    sdst.setBit(lane, cmpFunc(a, b));
+                }
+            }
+
+            sdst.write();
+        }
+
       private:
         bool hasSecondDword(InFmt_VOPC *);
     }; // Inst_VOPC

From 6558821e2d15d177507ab56c629679c8edfc06ed Mon Sep 17 00:00:00 2001
From: Matthew Poremba <matthew.poremba@amd.com>
Date: Wed, 17 Jul 2024 15:23:54 -0700
Subject: [PATCH 2/4] arch-vega: Add SDWAB for v_cmp_{eq,ne}_u16

This shows an example of how to use the previous commit which adds an
SDWAB helper. The execute() method of both are the same with the
exception of the lambda function passed to the helper method.

Change-Id: I5ffe361440b4020b9f7669c0ed946aa6b3bbec25
---
 src/arch/amdgpu/vega/insts/vopc.cc | 36 ++++++++++++++++++++++--------
 1 file changed, 27 insertions(+), 9 deletions(-)

diff --git a/src/arch/amdgpu/vega/insts/vopc.cc b/src/arch/amdgpu/vega/insts/vopc.cc
index 2c386fec74..0e1fb04f75 100644
--- a/src/arch/amdgpu/vega/insts/vopc.cc
+++ b/src/arch/amdgpu/vega/insts/vopc.cc
@@ -3782,13 +3782,21 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
-        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
-            if (wf->execMask(lane)) {
-                vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0);
-            }
-        }
+        auto cmpImpl = [](uint16_t a, uint16_t b) { return a == b ? 1 : 0; };
 
-        vcc.write();
+        if (isSDWAInst()) {
+            sdwabHelper<uint16_t>(gpuDynInst, cmpImpl);
+        } else if (isDPPInst()) {
+            panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+        } else {
+            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                if (wf->execMask(lane)) {
+                    vcc.setBit(lane, cmpImpl(src0[lane], src1[lane]));
+                }
+            }
+
+            vcc.write();
+        }
     } // execute
     // --- Inst_VOPC__V_CMP_LE_U16 class methods ---
 
@@ -3881,10 +3889,20 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
-        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
-            if (wf->execMask(lane)) {
-                vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0);
+        auto cmpImpl = [](uint16_t a, uint16_t b) { return a != b ? 1 : 0; };
+
+        if (isSDWAInst()) {
+            sdwabHelper<uint16_t>(gpuDynInst, cmpImpl);
+        } else if (isDPPInst()) {
+            panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+        } else {
+            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                if (wf->execMask(lane)) {
+                    vcc.setBit(lane, cmpImpl(src0[lane], src1[lane]));
+                }
             }
+
+            vcc.write();
         }
 
         vcc.write();

From b75fe56da5a92d703b5f3c841435ff87fe1f2b61 Mon Sep 17 00:00:00 2001
From: Matthew Poremba <matthew.poremba@amd.com>
Date: Wed, 24 Jul 2024 17:32:37 -0700
Subject: [PATCH 3/4] arch-vega: Panic unimplemented SDWA/DPP for VOP1/VOP2

Add a panic if SDWA or DPP is used for an instruction which does not
implement support for it. If an application uses SDWA or DPP it likely
does not operate in the same way as the base instruction and therefore
gem5 should panic rather than continue. It is likely data is incorrect
which will make it more difficult to debug an application.

Change-Id: I68ac448b0d62941761ef4efa0169f95796270f48
---
 src/arch/amdgpu/vega/insts/vop1.cc | 162 ++++++++++++++++++++++++++++-
 src/arch/amdgpu/vega/insts/vop2.cc | 131 +++++++++++++++++++++++
 2 files changed, 291 insertions(+), 2 deletions(-)

diff --git a/src/arch/amdgpu/vega/insts/vop1.cc b/src/arch/amdgpu/vega/insts/vop1.cc
index f970923951..f3744b52f2 100644
--- a/src/arch/amdgpu/vega/insts/vop1.cc
+++ b/src/arch/amdgpu/vega/insts/vop1.cc
@@ -80,6 +80,8 @@ namespace VegaISA
 
         src.readSrc();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+
         if (isDPPInst()) {
             VecOperandU32 src_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0);
             src_dpp.read();
@@ -148,6 +150,9 @@ namespace VegaISA
 
         src.readSrc();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         if (exec_mask) {
             src_lane = findLsbSet(exec_mask);
         }
@@ -182,6 +187,9 @@ namespace VegaISA
 
         src.readSrc();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 int exp;
@@ -226,6 +234,9 @@ namespace VegaISA
 
         src.readSrc();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = (VecElemF64)src[lane];
@@ -258,6 +269,9 @@ namespace VegaISA
 
         src.readSrc();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = (VecElemF32)src[lane];
@@ -290,6 +304,9 @@ namespace VegaISA
 
         src.readSrc();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = (VecElemF32)src[lane];
@@ -324,6 +341,9 @@ namespace VegaISA
 
         src.readSrc();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 int exp;
@@ -372,6 +392,9 @@ namespace VegaISA
 
         src.readSrc();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 int exp;
@@ -439,6 +462,9 @@ namespace VegaISA
 
         src.readSrc();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 float tmp = src[lane];
@@ -475,6 +501,9 @@ namespace VegaISA
 
         src.readSrc();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 AMDGPU::mxfloat16 tmp(src[lane]);
@@ -509,6 +538,9 @@ namespace VegaISA
 
         src.readSrc();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = (VecElemI32)std::floor(src[lane] + 0.5);
@@ -542,6 +574,9 @@ namespace VegaISA
 
         src.readSrc();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = (VecElemI32)std::floor(src[lane]);
@@ -595,6 +630,9 @@ namespace VegaISA
 
         src.readSrc();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = (VecElemF32)src[lane];
@@ -627,6 +665,9 @@ namespace VegaISA
 
         src.readSrc();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = (VecElemF64)src[lane];
@@ -659,6 +700,9 @@ namespace VegaISA
 
         src.readSrc();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = (VecElemF32)(bits(src[lane], 7, 0));
@@ -691,6 +735,9 @@ namespace VegaISA
 
         src.readSrc();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = (VecElemF32)(bits(src[lane], 15, 8));
@@ -723,6 +770,9 @@ namespace VegaISA
 
         src.readSrc();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = (VecElemF32)(bits(src[lane], 23, 16));
@@ -755,6 +805,9 @@ namespace VegaISA
 
         src.readSrc();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = (VecElemF32)(bits(src[lane], 31, 24));
@@ -789,6 +842,9 @@ namespace VegaISA
 
         src.readSrc();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 int exp;
@@ -835,6 +891,9 @@ namespace VegaISA
 
         src.readSrc();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = (VecElemF64)src[lane];
@@ -867,6 +926,9 @@ namespace VegaISA
 
         src.readSrc();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = std::trunc(src[lane]);
@@ -900,6 +962,9 @@ namespace VegaISA
 
         src.readSrc();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = std::ceil(src[lane]);
@@ -932,6 +997,9 @@ namespace VegaISA
 
         src.readSrc();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = roundNearestEven(src[lane]);
@@ -965,6 +1033,9 @@ namespace VegaISA
 
         src.readSrc();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = std::floor(src[lane]);
@@ -997,6 +1068,9 @@ namespace VegaISA
 
         src.readSrc();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 VecElemF32 int_part(0.0);
@@ -1030,6 +1104,9 @@ namespace VegaISA
 
         src.readSrc();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = std::trunc(src[lane]);
@@ -1063,6 +1140,9 @@ namespace VegaISA
 
         src.readSrc();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = std::ceil(src[lane]);
@@ -1095,6 +1175,9 @@ namespace VegaISA
 
         src.readSrc();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = roundNearestEven(src[lane]);
@@ -1128,6 +1211,9 @@ namespace VegaISA
 
         src.readSrc();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = std::floor(src[lane]);
@@ -1160,6 +1246,9 @@ namespace VegaISA
 
         src.readSrc();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = std::pow(2.0, src[lane]);
@@ -1192,6 +1281,9 @@ namespace VegaISA
 
         src.readSrc();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = std::log2(src[lane]);
@@ -1224,6 +1316,9 @@ namespace VegaISA
 
         src.readSrc();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = 1.0 / src[lane];
@@ -1258,6 +1353,9 @@ namespace VegaISA
 
         src.readSrc();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = 1.0 / src[lane];
@@ -1290,6 +1388,9 @@ namespace VegaISA
 
         src.readSrc();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = 1.0 / std::sqrt(src[lane]);
@@ -1322,6 +1423,9 @@ namespace VegaISA
 
         src.readSrc();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 if (std::fpclassify(src[lane]) == FP_ZERO) {
@@ -1366,6 +1470,9 @@ namespace VegaISA
 
         src.readSrc();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 if (std::fpclassify(src[lane]) == FP_ZERO) {
@@ -1409,6 +1516,9 @@ namespace VegaISA
 
         src.readSrc();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = std::sqrt(src[lane]);
@@ -1441,6 +1551,9 @@ namespace VegaISA
 
         src.readSrc();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = std::sqrt(src[lane]);
@@ -1477,6 +1590,9 @@ namespace VegaISA
         src.readSrc();
         pi.read();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 if (src[lane] < -256.0 || src[lane] > 256.0) {
@@ -1517,6 +1633,9 @@ namespace VegaISA
         src.readSrc();
         pi.read();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 if (src[lane] < -256.0 || src[lane] > 256.0) {
@@ -1553,6 +1672,9 @@ namespace VegaISA
 
         src.readSrc();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = ~src[lane];
@@ -1585,6 +1707,9 @@ namespace VegaISA
 
         src.readSrc();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = reverseBits(src[lane]);
@@ -1617,6 +1742,9 @@ namespace VegaISA
 
         src.readSrc();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = findFirstOneMsb(src[lane]);
@@ -1649,6 +1777,9 @@ namespace VegaISA
 
         src.readSrc();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = findFirstOne(src[lane]);
@@ -1681,6 +1812,9 @@ namespace VegaISA
 
         src.readSrc();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = firstOppositeSignBit(src[lane]);
@@ -1714,6 +1848,9 @@ namespace VegaISA
 
         src.readSrc();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 if (std::isinf(src[lane]) || std::isnan(src[lane])) {
@@ -1752,6 +1889,9 @@ namespace VegaISA
 
         src.readSrc();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 if (std::isinf(src[lane]) || std::isnan(src[lane])) {
@@ -1789,6 +1929,9 @@ namespace VegaISA
 
         src.readSrc();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 VecElemF64 int_part(0.0);
@@ -1827,6 +1970,9 @@ namespace VegaISA
 
         src.readSrc();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 if (std::isinf(src[lane]) || std::isnan(src[lane])) {
@@ -1870,6 +2016,9 @@ namespace VegaISA
 
         src.readSrc();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 if (std::isinf(src[lane]) || std::isnan(src[lane])) {
@@ -1926,8 +2075,8 @@ namespace VegaISA
 
         src.readSrc();
 
-        panic_if(isDPPInst(), "DPP unimplemented for v_mov_b64");
-        panic_if(isSDWAInst(), "SDWA unimplemented for v_mov_b64");
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
 
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
@@ -2359,6 +2508,9 @@ namespace VegaISA
 
         src.readSrc();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = std::pow(2.0, src[lane]);
@@ -2391,6 +2543,9 @@ namespace VegaISA
 
         src.readSrc();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = std::log2(src[lane]);
@@ -2423,6 +2578,9 @@ namespace VegaISA
 
         src.readSrc();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = src[lane];
diff --git a/src/arch/amdgpu/vega/insts/vop2.cc b/src/arch/amdgpu/vega/insts/vop2.cc
index 55146711b6..f6eec253a3 100644
--- a/src/arch/amdgpu/vega/insts/vop2.cc
+++ b/src/arch/amdgpu/vega/insts/vop2.cc
@@ -67,6 +67,9 @@ namespace VegaISA
         src1.read();
         vcc.read();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane]
@@ -102,6 +105,8 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+
         if (isDPPInst()) {
             VecOperandF32 src0_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0);
             src0_dpp.read();
@@ -163,6 +168,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = src0[lane] - src1[lane];
@@ -198,6 +206,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = src1[lane] - src0[lane];
@@ -232,6 +243,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = src0[lane] * src1[lane];
@@ -266,6 +280,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 if (std::isnan(src0[lane]) ||
@@ -344,6 +361,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = sext<24>(bits(src0[lane], 23, 0))
@@ -378,6 +398,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 VecElemI64 tmp_src0
@@ -445,6 +468,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 VecElemU64 tmp_src0 = (VecElemU64)bits(src0[lane], 23, 0);
@@ -481,6 +507,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = std::fmin(src0[lane], src1[lane]);
@@ -515,6 +544,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = std::fmax(src0[lane], src1[lane]);
@@ -548,6 +580,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = std::min(src0[lane], src1[lane]);
@@ -581,6 +616,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = std::max(src0[lane], src1[lane]);
@@ -614,6 +652,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = std::min(src0[lane], src1[lane]);
@@ -647,6 +688,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = std::max(src0[lane], src1[lane]);
@@ -682,6 +726,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = src1[lane] >> bits(src0[lane], 4, 0);
@@ -717,6 +764,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = src1[lane] >> bits(src0[lane], 4, 0);
@@ -751,6 +801,8 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         if (isSDWAInst()) {
             VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
             // use copies of original src0, src1, and vdst during selecting
@@ -826,6 +878,8 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+
         if (isDPPInst()) {
             VecOperandU32 src0_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0);
             src0_dpp.read();
@@ -886,6 +940,8 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         if (isSDWAInst()) {
             VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
             // use copies of original src0, src1, and dest during selecting
@@ -961,6 +1017,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = src0[lane] ^ src1[lane];
@@ -998,6 +1057,8 @@ namespace VegaISA
         src1.read();
         vdst.read();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+
         if (isDPPInst()) {
             VecOperandF32 src0_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0);
             src0_dpp.read();
@@ -1064,6 +1125,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = std::fma(src0[lane], k, src1[lane]);
@@ -1103,6 +1167,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = std::fma(src0[lane], src1[lane], k);
@@ -1141,6 +1208,8 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         if (isSDWAInst()) {
             VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
             // use copies of original src0, src1, and dest during selecting
@@ -1225,6 +1294,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = src0[lane] - src1[lane];
@@ -1265,6 +1337,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = src1[lane] - src0[lane];
@@ -1308,6 +1383,9 @@ namespace VegaISA
         src1.read();
         vcc.read();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = src0[lane] + src1[lane]
@@ -1355,6 +1433,9 @@ namespace VegaISA
         src1.read();
         vcc.read();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane]
@@ -1401,6 +1482,9 @@ namespace VegaISA
         src1.read();
         vcc.read();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane]
@@ -1598,6 +1682,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = src0[lane] + src1[lane];
@@ -1632,6 +1719,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = src0[lane] - src1[lane];
@@ -1667,6 +1757,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = src1[lane] - src0[lane];
@@ -1701,6 +1794,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = src0[lane] * src1[lane];
@@ -1735,6 +1831,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = src1[lane] << bits(src0[lane], 3, 0);
@@ -1770,6 +1869,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = src1[lane] >> src0[lane];
@@ -1805,6 +1907,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = src1[lane] >> src0[lane];
@@ -1882,6 +1987,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = std::max(src0[lane], src1[lane]);
@@ -1915,6 +2023,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = std::max(src0[lane], src1[lane]);
@@ -1948,6 +2059,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = std::min(src0[lane], src1[lane]);
@@ -1981,6 +2095,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = std::min(src0[lane], src1[lane]);
@@ -2034,6 +2151,8 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         if (isSDWAInst()) {
             VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
             // use copies of original src0, src1, and dest during selecting
@@ -2108,6 +2227,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = src0[lane] - src1[lane];
@@ -2141,6 +2263,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = src1[lane] - src0[lane];
@@ -2175,6 +2300,9 @@ namespace VegaISA
         src1.read();
         vdst.read();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = std::fma(src0[lane], src1[lane], vdst[lane]);
@@ -2209,6 +2337,9 @@ namespace VegaISA
         src1.read();
         vdst.read();
 
+        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = ~(src0[lane] ^ src1[lane]);

From 21f6e166b7f8fe9dcf9c23e4c890773ed6798a26 Mon Sep 17 00:00:00 2001
From: Matthew Poremba <matthew.poremba@amd.com>
Date: Wed, 17 Jul 2024 14:57:52 -0700
Subject: [PATCH 4/4] arch-vega: Panic on SDWAB / DPP VOPC unimplemented

If SDWAB or DPP are used on a VOPC instruction and those are not
implemented, it is highly likely to be a problem for the application.
Rather than continue to execute and cause undefined behavior, exit the
simulation with a panic showing the line of the instruction causing the
issue.

Change-Id: Ib3f94df7445d068b26907470c1f733be16cd2fc2
---
 src/arch/amdgpu/vega/insts/vopc.cc | 390 +++++++++++++++++++++++++++++
 1 file changed, 390 insertions(+)

diff --git a/src/arch/amdgpu/vega/insts/vopc.cc b/src/arch/amdgpu/vega/insts/vopc.cc
index 0e1fb04f75..9361e68b67 100644
--- a/src/arch/amdgpu/vega/insts/vopc.cc
+++ b/src/arch/amdgpu/vega/insts/vopc.cc
@@ -74,6 +74,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 if (bits(src1[lane], 0) || bits(src1[lane], 1)) {
@@ -189,6 +192,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 if (bits(src1[lane], 0) || bits(src1[lane], 1)) {
@@ -304,6 +310,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 if (bits(src1[lane], 0) || bits(src1[lane], 1)) {
@@ -420,6 +429,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 if (bits(src1[lane], 0) || bits(src1[lane], 1)) {
@@ -1277,6 +1289,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0);
@@ -1311,6 +1326,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0);
@@ -1345,6 +1363,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0);
@@ -1379,6 +1400,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0);
@@ -1413,6 +1437,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, (src0[lane] < src1[lane]
@@ -1448,6 +1475,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0);
@@ -1482,6 +1512,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, (!std::isnan(src0[lane])
@@ -1517,6 +1550,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, (std::isnan(src0[lane])
@@ -1552,6 +1588,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, !(src0[lane] >= src1[lane]) ? 1 : 0);
@@ -1586,6 +1625,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, !(src0[lane] < src1[lane]
@@ -1621,6 +1663,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0);
@@ -1655,6 +1700,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0);
@@ -1689,6 +1737,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0);
@@ -1723,6 +1774,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, !(src0[lane] < src1[lane]) ? 1 : 0);
@@ -1818,6 +1872,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0);
@@ -1854,6 +1911,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0);
@@ -1890,6 +1950,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0);
@@ -1926,6 +1989,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0);
@@ -1962,6 +2028,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, (src0[lane] < src1[lane]
@@ -1999,6 +2068,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0);
@@ -2036,6 +2108,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, (!std::isnan(src0[lane])
@@ -2074,6 +2149,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, (std::isnan(src0[lane])
@@ -2111,6 +2189,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, !(src0[lane] >= src1[lane]) ? 1 : 0);
@@ -2147,6 +2228,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, !(src0[lane] < src1[lane]
@@ -2184,6 +2268,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0);
@@ -2220,6 +2307,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0);
@@ -2256,6 +2346,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, !(src0[lane] == src1[lane]) ? 1 : 0);
@@ -2292,6 +2385,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, !(src0[lane] < src1[lane]) ? 1 : 0);
@@ -2387,6 +2483,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0);
@@ -2421,6 +2520,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0);
@@ -2455,6 +2557,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0);
@@ -2489,6 +2594,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0);
@@ -2523,6 +2631,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, (src0[lane] < src1[lane]
@@ -2558,6 +2669,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0);
@@ -2592,6 +2706,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, (!std::isnan(src0[lane])
@@ -2627,6 +2744,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, (std::isnan(src0[lane])
@@ -2662,6 +2782,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, !(src0[lane] >= src1[lane]) ? 1 : 0);
@@ -2696,6 +2819,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, !(src0[lane] < src1[lane]
@@ -2731,6 +2857,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0);
@@ -2765,6 +2894,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0);
@@ -2799,6 +2931,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0);
@@ -2833,6 +2968,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, !(src0[lane] < src1[lane]) ? 1 : 0);
@@ -2928,6 +3066,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0);
@@ -2964,6 +3105,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0);
@@ -3000,6 +3144,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0);
@@ -3036,6 +3183,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0);
@@ -3072,6 +3222,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, (src0[lane] < src1[lane]
@@ -3109,6 +3262,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0);
@@ -3146,6 +3302,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, (!std::isnan(src0[lane])
@@ -3184,6 +3343,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, (std::isnan(src0[lane])
@@ -3221,6 +3383,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, !(src0[lane] >= src1[lane]) ? 1 : 0);
@@ -3257,6 +3422,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, !(src0[lane] < src1[lane]
@@ -3294,6 +3462,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0);
@@ -3330,6 +3501,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0);
@@ -3366,6 +3540,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0);
@@ -3402,6 +3579,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, !(src0[lane] < src1[lane]) ? 1 : 0);
@@ -3495,6 +3675,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0);
@@ -3528,6 +3711,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0);
@@ -3561,6 +3747,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0);
@@ -3594,6 +3783,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0);
@@ -3627,6 +3819,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0);
@@ -3660,6 +3855,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0);
@@ -3749,6 +3947,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0);
@@ -3823,6 +4024,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0);
@@ -3856,6 +4060,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0);
@@ -3932,6 +4139,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0);
@@ -4024,6 +4234,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0);
@@ -4059,6 +4272,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0);
@@ -4094,6 +4310,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0);
@@ -4129,6 +4348,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0);
@@ -4164,6 +4386,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0);
@@ -4199,6 +4424,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0);
@@ -4294,6 +4522,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0);
@@ -4329,6 +4560,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0);
@@ -4364,6 +4598,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0);
@@ -4399,6 +4636,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0);
@@ -4434,6 +4674,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0);
@@ -4469,6 +4712,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0);
@@ -4561,6 +4807,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0);
@@ -4594,6 +4843,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0);
@@ -4627,6 +4879,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0);
@@ -4660,6 +4915,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0);
@@ -4693,6 +4951,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0);
@@ -4726,6 +4987,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0);
@@ -4815,6 +5079,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0);
@@ -4848,6 +5115,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0);
@@ -4881,6 +5151,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0);
@@ -4914,6 +5187,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0);
@@ -4947,6 +5223,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0);
@@ -4980,6 +5259,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0);
@@ -5072,6 +5354,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0);
@@ -5107,6 +5392,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0);
@@ -5142,6 +5430,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0);
@@ -5177,6 +5468,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0);
@@ -5212,6 +5506,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0);
@@ -5247,6 +5544,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0);
@@ -5342,6 +5642,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0);
@@ -5377,6 +5680,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0);
@@ -5412,6 +5718,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0);
@@ -5447,6 +5756,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0);
@@ -5482,6 +5794,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0);
@@ -5517,6 +5832,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0);
@@ -5609,6 +5927,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0);
@@ -5642,6 +5963,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0);
@@ -5675,6 +5999,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0);
@@ -5708,6 +6035,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0);
@@ -5741,6 +6071,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0);
@@ -5774,6 +6107,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0);
@@ -5863,6 +6199,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0);
@@ -5896,6 +6235,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0);
@@ -5929,6 +6271,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0);
@@ -5962,6 +6307,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0);
@@ -5995,6 +6343,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0);
@@ -6028,6 +6379,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0);
@@ -6120,6 +6474,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0);
@@ -6155,6 +6512,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0);
@@ -6190,6 +6550,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0);
@@ -6225,6 +6588,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0);
@@ -6260,6 +6626,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0);
@@ -6295,6 +6664,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0);
@@ -6390,6 +6762,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0);
@@ -6425,6 +6800,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0);
@@ -6460,6 +6838,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0);
@@ -6495,6 +6876,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0);
@@ -6530,6 +6914,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0);
@@ -6565,6 +6952,9 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0);